301 files changed, 9359 insertions, 4629 deletions
diff --git a/test/CodeGen/R600/128bit-kernel-args.ll b/test/CodeGen/R600/128bit-kernel-args.ll
deleted file mode 100644
index 557d86aa8376..000000000000
--- a/test/CodeGen/R600/128bit-kernel-args.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI
-
-; R600: {{^}}v4i32_kernel_arg:
-; R600-DAG: MOV {{[* ]*}}T[[GPR:[0-9]]].X, KC0[3].Y
-; R600-DAG: MOV {{[* ]*}}T[[GPR]].Y, KC0[3].Z
-; R600-DAG: MOV {{[* ]*}}T[[GPR]].Z, KC0[3].W
-; R600-DAG: MOV {{[* ]*}}T[[GPR]].W, KC0[4].X
-; SI: {{^}}v4i32_kernel_arg:
-; SI: buffer_store_dwordx4
-define void @v4i32_kernel_arg(<4 x i32> addrspace(1)* %out, <4 x i32>  %in) {
-entry:
-  store <4 x i32> %in, <4 x i32> addrspace(1)* %out
-  ret void
-}
-
-; R600: {{^}}v4f32_kernel_arg:
-; R600-DAG: MOV {{[* ]*}}T[[GPR:[0-9]]].X, KC0[3].Y
-; R600-DAG: MOV {{[* ]*}}T[[GPR]].Y, KC0[3].Z
-; R600-DAG: MOV {{[* ]*}}T[[GPR]].Z, KC0[3].W
-; R600-DAG: MOV {{[* ]*}}T[[GPR]].W, KC0[4].X
-; SI: {{^}}v4f32_kernel_arg:
-; SI: buffer_store_dwordx4
-define void @v4f32_kernel_arg(<4 x float> addrspace(1)* %out, <4 x float>  %in) {
-entry:
-  store <4 x float> %in, <4 x float> addrspace(1)* %out
-  ret void
-}
diff --git a/test/CodeGen/R600/32-bit-local-address-space.ll b/test/CodeGen/R600/32-bit-local-address-space.ll
index 71940fd88f26..c7bcfd2ddab2 100644
--- a/test/CodeGen/R600/32-bit-local-address-space.ll
+++ b/test/CodeGen/R600/32-bit-local-address-space.ll
@@ -15,7 +15,7 @@
 ; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
 define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
-  %0 = load i32 addrspace(3)* %in
+  %0 = load i32, i32 addrspace(3)* %in
   store i32 %0, i32 addrspace(1)* %out
   ret void
 }
@@ -26,8 +26,8 @@ entry:
 ; SI: ds_read_b32 [[VPTR]]
 define void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) {
 entry:
-  %0 = getelementptr i32 addrspace(3)* %in, i32 %offset
-  %1 = load i32 addrspace(3)* %0
+  %0 = getelementptr i32, i32 addrspace(3)* %in, i32 %offset
+  %1 = load i32, i32 addrspace(3)* %0
   store i32 %1, i32 addrspace(1)* %out
   ret void
 }
@@ -37,8 +37,8 @@ entry:
 ; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4
 define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
-  %0 = getelementptr i32 addrspace(3)* %in, i32 1
-  %1 = load i32 addrspace(3)* %0
+  %0 = getelementptr i32, i32 addrspace(3)* %in, i32 1
+  %1 = load i32, i32 addrspace(3)* %0
   store i32 %1, i32 addrspace(1)* %out
   ret void
 }
@@ -50,8 +50,8 @@ entry:
 ; SI: ds_read_b32 [[VPTR]]
 define void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
-  %0 = getelementptr i32 addrspace(3)* %in, i32 16385
-  %1 = load i32 addrspace(3)* %0
+  %0 = getelementptr i32, i32 addrspace(3)* %in, i32 16385
+  %1 = load i32, i32 addrspace(3)* %0
   store i32 %1, i32 addrspace(1)* %out
   ret void
 }
@@ -72,8 +72,8 @@ define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds)
 ; SI-NEXT: s_add_i32
 ; SI: ds_read_b32
 define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
-  %ptr = getelementptr [3 x float] addrspace(3)* %lds, i32 %tid, i32 0
-  %val = load float addrspace(3)* %ptr
+  %ptr = getelementptr [3 x float], [3 x float] addrspace(3)* %lds, i32 %tid, i32 0
+  %val = load float, float addrspace(3)* %ptr
   store float %val, float addrspace(1)* %out
   ret void
 }
@@ -84,7 +84,7 @@ define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
 ; SI: ds_read_b32 v{{[0-9]+}}, [[REG]]
 define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
-  %val = load float addrspace(3)* @g_lds
+  %val = load float, float addrspace(3)* @g_lds
   store float %val, float addrspace(1)* %out
   ret void
 }
@@ -96,7 +96,7 @@ define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %ti
 ; FUNC-LABEL: {{^}}global_ptr:
 ; SI: ds_write_b32
 define void @global_ptr() nounwind {
-  store i32 addrspace(3)* getelementptr ([16384 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
+  store i32 addrspace(3)* getelementptr ([16384 x i32], [16384 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
   ret void
 }
 
@@ -112,7 +112,7 @@ define void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
 ; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]]
 ; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}}
 define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
-  %gep = getelementptr i32 addrspace(3)* %out, i32 %offset
+  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 %offset
   store i32 %val, i32 addrspace(3)* %gep, align 4
   ret void
 }
@@ -122,7 +122,7 @@ define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32
 ; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
 ; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4
 define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
-  %gep = getelementptr i32 addrspace(3)* %out, i32 1
+  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 1
   store i32 %val, i32 addrspace(3)* %gep, align 4
   ret void
 }
@@ -131,9 +131,9 @@ define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %v
 ; FUNC-LABEL: {{^}}local_address_gep_large_const_offset_store:
 ; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; SI: ds_write_b32 [[VPTR]], v{{[0-9]+}} [M0]{{$}}
+; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}}
 define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
-  %gep = getelementptr i32 addrspace(3)* %out, i32 16385
+  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 16385
   store i32 %val, i32 addrspace(3)* %gep, align 4
   ret void
 }
diff --git a/test/CodeGen/R600/64bit-kernel-args.ll b/test/CodeGen/R600/64bit-kernel-args.ll
deleted file mode 100644
index 9f2738edb6eb..000000000000
--- a/test/CodeGen/R600/64bit-kernel-args.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
-
-; SI: {{^}}f64_kernel_arg:
-; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
-; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
-; SI: buffer_store_dwordx2
-define void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
-entry:
-  store double %in, double addrspace(1)* %out
-  ret void
-}
diff --git a/test/CodeGen/R600/add-debug.ll b/test/CodeGen/R600/add-debug.ll
index a83c689eb182..529905dd36a2 100644
--- a/test/CodeGen/R600/add-debug.ll
+++ b/test/CodeGen/R600/add-debug.ll
@@ -9,7 +9,7 @@ entry:
   br i1 %0, label %if, label %else
 
 if:
-  %1 = load i64 addrspace(1)* %in
+  %1 = load i64, i64 addrspace(1)* %in
   br label %endif
 
 else:
diff --git a/test/CodeGen/R600/add.ll b/test/CodeGen/R600/add.ll
index 3a8b97cd87e8..655e75dbc1a4 100644
--- a/test/CodeGen/R600/add.ll
+++ b/test/CodeGen/R600/add.ll
@@ -9,9 +9,9 @@
 ;SI-NOT: [[REG]]
 ;SI: buffer_store_dword [[REG]],
 define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %a = load i32 addrspace(1)* %in
-  %b = load i32 addrspace(1)* %b_ptr
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %a = load i32, i32 addrspace(1)* %in
+  %b = load i32, i32 addrspace(1)* %b_ptr
   %result = add i32 %a, %b
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -25,9 +25,9 @@ define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
-  %a = load <2 x i32> addrspace(1)* %in
-  %b = load <2 x i32> addrspace(1)* %b_ptr
+  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
+  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
   %result = add <2 x i32> %a, %b
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
@@ -45,9 +45,9 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
 ;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
-  %a = load <4 x i32> addrspace(1)* %in
-  %b = load <4 x i32> addrspace(1)* %b_ptr
+  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
+  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
   %result = add <4 x i32> %a, %b
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
@@ -62,6 +62,7 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
 ; EG: ADD_INT
 ; EG: ADD_INT
 ; EG: ADD_INT
+
 ; SI: s_add_i32
 ; SI: s_add_i32
 ; SI: s_add_i32
@@ -94,6 +95,7 @@ entry:
 ; EG: ADD_INT
 ; EG: ADD_INT
 ; EG: ADD_INT
+
 ; SI: s_add_i32
 ; SI: s_add_i32
 ; SI: s_add_i32
@@ -120,6 +122,14 @@ entry:
 ; FUNC-LABEL: {{^}}add64:
 ; SI: s_add_u32
 ; SI: s_addc_u32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
+; EG-DAG: ADD_INT {{[* ]*}}[[LO]]
+; EG-DAG: ADDC_UINT
+; EG-DAG: ADD_INT
+; EG-DAG: ADD_INT {{[* ]*}}[[HI]]
+; EG-NOT: SUB
 define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = add i64 %a, %b
@@ -134,9 +144,17 @@ entry:
 
 ; FUNC-LABEL: {{^}}add64_sgpr_vgpr:
 ; SI-NOT: v_addc_u32_e32 s
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
+; EG-DAG: ADD_INT {{[* ]*}}[[LO]]
+; EG-DAG: ADDC_UINT
+; EG-DAG: ADD_INT
+; EG-DAG: ADD_INT {{[* ]*}}[[HI]]
+; EG-NOT: SUB
 define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
 entry:
-  %0 = load i64 addrspace(1)* %in
+  %0 = load i64, i64 addrspace(1)* %in
   %1 = add i64 %a, %0
   store i64 %1, i64 addrspace(1)* %out
   ret void
@@ -146,13 +164,21 @@ entry:
 ; FUNC-LABEL: {{^}}add64_in_branch:
 ; SI: s_add_u32
 ; SI: s_addc_u32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
+; EG-DAG: ADD_INT {{[* ]*}}[[LO]]
+; EG-DAG: ADDC_UINT
+; EG-DAG: ADD_INT
+; EG-DAG: ADD_INT {{[* ]*}}[[HI]]
+; EG-NOT: SUB
 define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
 entry:
   %0 = icmp eq i64 %a, 0
   br i1 %0, label %if, label %else
 
 if:
-  %1 = load i64 addrspace(1)* %in
+  %1 = load i64, i64 addrspace(1)* %in
   br label %endif
 
 else:
diff --git a/test/CodeGen/R600/add_i64.ll b/test/CodeGen/R600/add_i64.ll
index 1769409f5ef1..8346add7df97 100644
--- a/test/CodeGen/R600/add_i64.ll
+++ b/test/CodeGen/R600/add_i64.ll
@@ -8,10 +8,10 @@ declare i32 @llvm.r600.read.tidig.x() readnone
 ; SI: v_addc_u32
 define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.r600.read.tidig.x() readnone
-  %a_ptr = getelementptr i64 addrspace(1)* %inA, i32 %tid
-  %b_ptr = getelementptr i64 addrspace(1)* %inB, i32 %tid
-  %a = load i64 addrspace(1)* %a_ptr
-  %b = load i64 addrspace(1)* %b_ptr
+  %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
+  %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
+  %a = load i64, i64 addrspace(1)* %a_ptr
+  %b = load i64, i64 addrspace(1)* %b_ptr
   %result = add i64 %a, %b
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -22,7 +22,7 @@ define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noa
 ; SI: v_add_i32
 ; SI: v_addc_u32
 define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
-  %foo = load i64 addrspace(1)* %in, align 8
+  %foo = load i64, i64 addrspace(1)* %in, align 8
   %result = add i64 %foo, %a
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -35,7 +35,7 @@ define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noal
 ; SI: v_add_i32
 ; SI: v_addc_u32
 define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
-  %foo = load i64 addrspace(1)* %in, align 8
+  %foo = load i64, i64 addrspace(1)* %in, align 8
   %result = add i64 %a, %foo
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -60,10 +60,10 @@ define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a,
 ; SI: v_addc_u32
 define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.r600.read.tidig.x() readnone
-  %a_ptr = getelementptr <2 x i64> addrspace(1)* %inA, i32 %tid
-  %b_ptr = getelementptr <2 x i64> addrspace(1)* %inB, i32 %tid
-  %a = load <2 x i64> addrspace(1)* %a_ptr
-  %b = load <2 x i64> addrspace(1)* %b_ptr
+  %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
+  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
+  %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr
+  %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
   %result = add <2 x i64> %a, %b
   store <2 x i64> %result, <2 x i64> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/address-space.ll b/test/CodeGen/R600/address-space.ll
index aaa0628ccdc9..4be8c5847529 100644
--- a/test/CodeGen/R600/address-space.ll
+++ b/test/CodeGen/R600/address-space.ll
@@ -10,18 +10,19 @@
 
 ; CHECK-LABEL: {{^}}do_as_ptr_calcs:
 ; CHECK: s_load_dword [[SREG1:s[0-9]+]],
+; CHECK: v_mov_b32_e32 [[VREG2:v[0-9]+]], [[SREG1]]
 ; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
 ; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:12
-; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:20
+; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG2]] offset:20
 define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
 entry:
-  %x = getelementptr inbounds %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
-  %y = getelementptr inbounds %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
+  %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
+  %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
   br label %bb32
 
 bb32:
-  %a = load float addrspace(3)* %x, align 4
-  %b = load float addrspace(3)* %y, align 4
+  %a = load float, float addrspace(3)* %x, align 4
+  %b = load float, float addrspace(3)* %y, align 4
   %cmp = fcmp one float %a, %b
   br i1 %cmp, label %bb34, label %bb33
 
diff --git a/test/CodeGen/R600/and.ll b/test/CodeGen/R600/and.ll
index 7a395ccb38d0..5672d470bd7e 100644
--- a/test/CodeGen/R600/and.ll
+++ b/test/CodeGen/R600/and.ll
@@ -10,9 +10,9 @@
 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
-  %a = load <2 x i32> addrspace(1) * %in
-  %b = load <2 x i32> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
+  %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
   %result = and <2 x i32> %a, %b
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
@@ -30,9 +30,9 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
-  %a = load <4 x i32> addrspace(1) * %in
-  %b = load <4 x i32> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
+  %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
   %result = and <4 x i32> %a, %b
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
@@ -57,23 +57,41 @@ define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
 ; FUNC-LABEL: {{^}}v_and_i32:
 ; SI: v_and_b32
 define void @v_and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) {
-  %a = load i32 addrspace(1)* %aptr, align 4
-  %b = load i32 addrspace(1)* %bptr, align 4
+  %a = load i32, i32 addrspace(1)* %aptr, align 4
+  %b = load i32, i32 addrspace(1)* %bptr, align 4
   %and = and i32 %a, %b
   store i32 %and, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: {{^}}v_and_constant_i32:
-; SI: v_and_b32
+; FUNC-LABEL: {{^}}v_and_constant_i32
+; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}}
 define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
-  %a = load i32 addrspace(1)* %aptr, align 4
+  %a = load i32, i32 addrspace(1)* %aptr, align 4
   %and = and i32 %a, 1234567
   store i32 %and, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: {{^}}s_and_i64:
+; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32
+; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}}
+define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
+  %a = load i32, i32 addrspace(1)* %aptr, align 4
+  %and = and i32 %a, 64
+  store i32 %and, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32
+; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}}
+define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
+  %a = load i32, i32 addrspace(1)* %aptr, align 4
+  %and = and i32 %a, -16
+  store i32 %and, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_i64
 ; SI: s_and_b64
 define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %and = and i64 %a, %b
@@ -90,8 +108,8 @@ define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
   ret void
 }
 
-; FUNC-LABEL: {{^}}s_and_constant_i64:
-; SI: s_and_b64
+; FUNC-LABEL: {{^}}s_and_constant_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
 define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
   %and = and i64 %a, 281474976710655
   store i64 %and, i64 addrspace(1)* %out, align 8
@@ -102,8 +120,8 @@ define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
 ; SI: v_and_b32
 ; SI: v_and_b32
 define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
-  %a = load i64 addrspace(1)* %aptr, align 8
-  %b = load i64 addrspace(1)* %bptr, align 8
+  %a = load i64, i64 addrspace(1)* %aptr, align 8
+  %b = load i64, i64 addrspace(1)* %bptr, align 8
   %and = and i64 %a, %b
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -118,8 +136,8 @@ entry:
   br i1 %tmp0, label %if, label %endif
 
 if:
-  %a = load i64 addrspace(1)* %aptr, align 8
-  %b = load i64 addrspace(1)* %bptr, align 8
+  %a = load i64, i64 addrspace(1)* %aptr, align 8
+  %b = load i64, i64 addrspace(1)* %bptr, align 8
   %and = and i64 %a, %b
   br label %endif
 
@@ -133,7 +151,7 @@ endif:
 ; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
-  %a = load i64 addrspace(1)* %aptr, align 8
+  %a = load i64, i64 addrspace(1)* %aptr, align 8
   %and = and i64 %a, 1234567
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -144,16 +162,135 @@ define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr)
 ; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}}
 ; SI: v_and_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
 define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
-  %a = load i64 addrspace(1)* %aptr, align 8
+  %a = load i64, i64 addrspace(1)* %aptr, align 8
   %and = and i64 %a, 64
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: {{^}}s_and_inline_imm_i64:
+; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64
 ; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 64
-define void @s_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 64
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
 }
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_1_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1
+define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 1
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_1.0_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0
+define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 4607182418800017408
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_1.0_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0
+define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 13830554455654793216
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_0.5_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5
+define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 4602678819172646912
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_0.5_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5
+define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 13826050856027422720
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 2.0
+define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 4611686018427387904
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -2.0
+define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 13835058055282163712
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_4.0_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0
+define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 4616189618054758400
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_4.0_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0
+define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 13839561654909534208
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+
+; Test with the 64-bit integer bitpattern for a 32-bit float in the
+; low 32-bits, which is not a valid 64-bit inline immmediate.
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 4.0
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0{{$}}
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 1082130432
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FIXME: Copy of -1 register
+; FUNC-LABEL: {{^}}s_and_inline_imm_f32_neg_4.0_i64
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], -4.0
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -1{{$}}
+; SI-DAG: s_mov_b32 s[[K_HI_COPY:[0-9]+]], s[[K_HI]]
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI_COPY]]{{\]}}
+define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, -1065353216
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; Shift into upper 32-bits
+; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_4.0_i64
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 4.0
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 4647714815446351872
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -4.0
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 13871086852301127680
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
diff --git a/test/CodeGen/R600/array-ptr-calc-i32.ll b/test/CodeGen/R600/array-ptr-calc-i32.ll
index 33a8aee0164d..8c2a0795860d 100644
--- a/test/CodeGen/R600/array-ptr-calc-i32.ll
+++ b/test/CodeGen/R600/array-ptr-calc-i32.ll
@@ -20,24 +20,24 @@ declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
 ; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
 ; alloca to a vector.  It currently fails because it does not know how
 ; to interpret:
-; getelementptr [4 x i32]* %alloca, i32 1, i32 %b
+; getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b
 
 ; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], 16
 ; SI-PROMOTE: ds_write_b32 [[PTRREG]]
 define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
   %alloca = alloca [4 x i32], i32 4, align 16
   %tid = call i32 @llvm.SI.tid() readnone
-  %a_ptr = getelementptr i32 addrspace(1)* %inA, i32 %tid
-  %b_ptr = getelementptr i32 addrspace(1)* %inB, i32 %tid
-  %a = load i32 addrspace(1)* %a_ptr
-  %b = load i32 addrspace(1)* %b_ptr
+  %a_ptr = getelementptr i32, i32 addrspace(1)* %inA, i32 %tid
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid
+  %a = load i32, i32 addrspace(1)* %a_ptr
+  %b = load i32, i32 addrspace(1)* %b_ptr
   %result = add i32 %a, %b
-  %alloca_ptr = getelementptr [4 x i32]* %alloca, i32 1, i32 %b
+  %alloca_ptr = getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b
   store i32 %result, i32* %alloca_ptr, align 4
   ; Dummy call
   call void @llvm.AMDGPU.barrier.local() nounwind noduplicate
-  %reload = load i32* %alloca_ptr, align 4
-  %out_ptr = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %reload = load i32, i32* %alloca_ptr, align 4
+  %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
   store i32 %reload, i32 addrspace(1)* %out_ptr, align 4
   ret void
 }
diff --git a/test/CodeGen/R600/array-ptr-calc-i64.ll b/test/CodeGen/R600/array-ptr-calc-i64.ll
index 32e657db7bc6..eae095eb8449 100644
--- a/test/CodeGen/R600/array-ptr-calc-i64.ll
+++ b/test/CodeGen/R600/array-ptr-calc-i64.ll
@@ -7,10 +7,10 @@ declare i32 @llvm.SI.tid() readnone
 ; SI: v_mul_hi_i32
 define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.SI.tid() readnone
-  %a_ptr = getelementptr [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0
-  %b_ptr = getelementptr i32 addrspace(1)* %inB, i32 %tid
-  %a = load i32 addrspace(1)* %a_ptr
-  %b = load i32 addrspace(1)* %b_ptr
+  %a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid
+  %a = load i32, i32 addrspace(1)* %a_ptr
+  %b = load i32, i32 addrspace(1)* %b_ptr
   %result = add i32 %a, %b
   store i32 %result, i32 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/atomic_cmp_swap_local.ll b/test/CodeGen/R600/atomic_cmp_swap_local.ll
index 0d5ece4b0e0b..ef2560ef1849 100644
--- a/test/CodeGen/R600/atomic_cmp_swap_local.ll
+++ b/test/CodeGen/R600/atomic_cmp_swap_local.ll
@@ -1,16 +1,19 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SICI -check-prefix=GCN -check-prefix=FUNC  %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SICI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
-; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
-; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; SI-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
-; SI: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 [M0]
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
+; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
+; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
+; GCN: s_endpgm
 define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
   %result = extractvalue { i32, i1 } %pair, 0
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -18,18 +21,20 @@ define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrs
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
-; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; SI-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
-; SI-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
-; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; SI-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
-; SI-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
-; SI: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 [M0]
-; SI: buffer_store_dwordx2 [[RESULT]],
-; SI: s_endpgm
+; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
+; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
+; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
+; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
+; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32
+; GCN: buffer_store_dwordx2 [[RESULT]],
+; GCN: s_endpgm
 define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
   %result = extractvalue { i64, i1 } %pair, 0
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -38,12 +43,12 @@ define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrs
 
 ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset
 ; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 [M0]
-; SI: s_endpgm
+; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 %add
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
   %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
   %result = extractvalue { i32, i1 } %pair, 0
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -51,32 +56,36 @@ define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i3
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset:
-; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
-; SI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa
-; SI-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
-; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; SI-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
-; SI: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 [M0]
-; SI: s_endpgm
+; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa
+; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
+; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
+; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
+; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
+; GCN: s_endpgm
 define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
   %result = extractvalue { i32, i1 } %pair, 0
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset:
-; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
-; SI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
-; SI-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
-; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; SI-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
-; SI-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
-; SI: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 [M0]
-; SI: s_endpgm
+; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
+; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
+; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
+; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
+; GCN: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
   %result = extractvalue { i64, i1 } %pair, 0
   ret void
diff --git a/test/CodeGen/R600/atomic_load_add.ll b/test/CodeGen/R600/atomic_load_add.ll
index 5fe05f2996af..20c685447eef 100644
--- a/test/CodeGen/R600/atomic_load_add.ll
+++ b/test/CodeGen/R600/atomic_load_add.ll
@@ -14,7 +14,7 @@ define void @atomic_add_local(i32 addrspace(3)* %local) {
 ; R600: LDS_ADD *
 ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
-  %gep = getelementptr i32 addrspace(3)* %local, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
   %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
   ret void
 }
@@ -32,7 +32,7 @@ define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %loc
 ; R600: LDS_ADD_RET *
 ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
 define void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
-  %gep = getelementptr i32 addrspace(3)* %local, i32 5
+  %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
   %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
   store i32 %val, i32 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/atomic_load_sub.ll b/test/CodeGen/R600/atomic_load_sub.ll
index 40722833d265..4c6f45525b9e 100644
--- a/test/CodeGen/R600/atomic_load_sub.ll
+++ b/test/CodeGen/R600/atomic_load_sub.ll
@@ -14,7 +14,7 @@ define void @atomic_sub_local(i32 addrspace(3)* %local) {
 ; R600: LDS_SUB *
 ; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
-  %gep = getelementptr i32 addrspace(3)* %local, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
   %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
   ret void
 }
@@ -32,7 +32,7 @@ define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %loc
 ; R600: LDS_SUB_RET *
 ; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
 define void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
-  %gep = getelementptr i32 addrspace(3)* %local, i32 5
+  %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
   %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
   store i32 %val, i32 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/bfe_uint.ll b/test/CodeGen/R600/bfe_uint.ll
index 6fe23e912952..32e3fc26106f 100644
--- a/test/CodeGen/R600/bfe_uint.ll
+++ b/test/CodeGen/R600/bfe_uint.ll
@@ -1,7 +1,5 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-; XFAIL: *
-
 ; CHECK: {{^}}bfe_def:
 ; CHECK: BFE_UINT
 define void @bfe_def(i32 addrspace(1)* %out, i32 %x) {
diff --git a/test/CodeGen/R600/big_alu.ll b/test/CodeGen/R600/big_alu.ll
index 28be216e76f2..2671c5d102b3 100644
--- a/test/CodeGen/R600/big_alu.ll
+++ b/test/CodeGen/R600/big_alu.ll
@@ -51,29 +51,29 @@ main_body:
   %43 = extractelement <4 x float> %reg7, i32 1
   %44 = extractelement <4 x float> %reg7, i32 2
   %45 = extractelement <4 x float> %reg7, i32 3
-  %46 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+  %46 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
   %47 = extractelement <4 x float> %46, i32 0
-  %48 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+  %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
   %49 = extractelement <4 x float> %48, i32 1
-  %50 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+  %50 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
   %51 = extractelement <4 x float> %50, i32 2
-  %52 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12)
+  %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12)
   %53 = extractelement <4 x float> %52, i32 0
-  %54 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+  %54 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
   %55 = extractelement <4 x float> %54, i32 0
-  %56 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+  %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
   %57 = extractelement <4 x float> %56, i32 1
-  %58 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+  %58 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
   %59 = extractelement <4 x float> %58, i32 2
-  %60 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+  %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
   %61 = extractelement <4 x float> %60, i32 3
-  %62 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+  %62 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
   %63 = extractelement <4 x float> %62, i32 0
-  %64 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+  %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
   %65 = extractelement <4 x float> %64, i32 1
-  %66 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+  %66 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
   %67 = extractelement <4 x float> %66, i32 2
-  %68 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
   %69 = extractelement <4 x float> %68, i32 0
   %70 = fcmp oge float %69, 3.500000e+00
   %71 = sext i1 %70 to i32
@@ -81,7 +81,7 @@ main_body:
   %73 = bitcast float %72 to i32
   %74 = icmp ne i32 %73, 0
   %. = select i1 %74, float 0.000000e+00, float 0.000000e+00
-  %75 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %75 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
   %76 = extractelement <4 x float> %75, i32 0
   %77 = fcmp oge float %76, 2.000000e+00
   %78 = sext i1 %77 to i32
@@ -135,7 +135,7 @@ IF137:                                            ; preds = %main_body
   %123 = insertelement <4 x float> %122, float 0.000000e+00, i32 3
   %124 = call float @llvm.AMDGPU.dp4(<4 x float> %119, <4 x float> %123)
   %125 = fdiv float 1.000000e+00, %124
-  %126 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %126 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
   %127 = extractelement <4 x float> %126, i32 0
   %128 = fmul float %127, %125
   %129 = fmul float %103, %128
@@ -347,15 +347,15 @@ ENDIF136:                                         ; preds = %main_body, %ENDIF15
   %329 = fmul float %314, %328
   %330 = fmul float %316, %328
   %331 = fmul float %318, %328
-  %332 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+  %332 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
   %333 = extractelement <4 x float> %332, i32 0
   %334 = fsub float -0.000000e+00, %333
   %335 = fadd float 1.000000e+00, %334
-  %336 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %336 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
   %337 = extractelement <4 x float> %336, i32 0
   %338 = fsub float -0.000000e+00, %337
   %339 = fadd float 1.000000e+00, %338
-  %340 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+  %340 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
   %341 = extractelement <4 x float> %340, i32 0
   %342 = fsub float -0.000000e+00, %341
   %343 = fadd float 1.000000e+00, %342
@@ -1018,7 +1018,7 @@ ENDIF175:                                         ; preds = %ENDIF172, %IF176
   %temp92.11 = phi float [ %877, %IF176 ], [ %temp92.10, %ENDIF172 ]
   %temp93.5 = phi float [ %878, %IF176 ], [ %temp93.4, %ENDIF172 ]
   %temp94.5 = phi float [ %879, %IF176 ], [ %temp94.4, %ENDIF172 ]
-  %880 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+  %880 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
   %881 = extractelement <4 x float> %880, i32 0
   %882 = fcmp olt float %881, %179
   %883 = sext i1 %882 to i32
@@ -1114,12 +1114,12 @@ ENDIF178:                                         ; preds = %ENDIF175, %IF179
   %960 = fmul float %temp87.6, %956
   %961 = fmul float %2, -2.000000e+00
   %962 = fadd float %961, 1.000000e+00
-  %963 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23)
+  %963 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23)
   %964 = extractelement <4 x float> %963, i32 2
   %965 = fsub float -0.000000e+00, %964
   %966 = fadd float %962, %965
   %967 = fdiv float 1.000000e+00, %966
-  %968 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 24)
+  %968 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 24)
   %969 = extractelement <4 x float> %968, i32 2
   %970 = fmul float %969, %967
   %971 = fsub float -0.000000e+00, %53
diff --git a/test/CodeGen/R600/bitcast.ll b/test/CodeGen/R600/bitcast.ll
index 1ba64af7dca3..fd56d956bf31 100644
--- a/test/CodeGen/R600/bitcast.ll
+++ b/test/CodeGen/R600/bitcast.ll
@@ -9,7 +9,7 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float
 ; SI: s_endpgm
 define void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 {
 entry:
-  %1 = load <32 x i8> addrspace(2)* %0
+  %1 = load <32 x i8>, <32 x i8> addrspace(2)* %0
   %2 = bitcast <32 x i8> %1 to <8 x i32>
   %3 = extractelement <8 x i32> %2, i32 1
   %4 = icmp ne i32 %3, 0
@@ -23,34 +23,34 @@ entry:
 define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)*
-  %1 = load <16 x i8> addrspace(1)* %0
+  %1 = load <16 x i8>, <16 x i8> addrspace(1)* %0
   store <16 x i8> %1, <16 x i8> addrspace(1)* %out
   ret void
 }
 
 define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
-  %load = load float addrspace(1)* %in, align 4
+  %load = load float, float addrspace(1)* %in, align 4
   %bc = bitcast float %load to <2 x i16>
   store <2 x i16> %bc, <2 x i16> addrspace(1)* %out, align 4
   ret void
 }
 
 define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
-  %load = load <2 x i16> addrspace(1)* %in, align 4
+  %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
   %bc = bitcast <2 x i16> %load to float
   store float %bc, float addrspace(1)* %out, align 4
   ret void
 }
 
 define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
-  %load = load <4 x i8> addrspace(1)* %in, align 4
+  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   %bc = bitcast <4 x i8> %load to i32
   store i32 %bc, i32 addrspace(1)* %out, align 4
   ret void
 }
 
 define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %load = load i32 addrspace(1)* %in, align 4
+  %load = load i32, i32 addrspace(1)* %in, align 4
   %bc = bitcast i32 %load to <4 x i8>
   store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4
   ret void
@@ -59,7 +59,7 @@ define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nou
 ; FUNC-LABEL: {{^}}bitcast_v2i32_to_f64:
 ; SI: s_endpgm
 define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %val = load <2 x i32> addrspace(1)* %in, align 8
+  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
   %add = add <2 x i32> %val, <i32 4, i32 9>
   %bc = bitcast <2 x i32> %add to double
   store double %bc, double addrspace(1)* %out, align 8
@@ -69,7 +69,7 @@ define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace
 ; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32:
 ; SI: s_endpgm
 define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
-  %val = load double addrspace(1)* %in, align 8
+  %val = load double, double addrspace(1)* %in, align 8
   %add = fadd double %val, 4.0
   %bc = bitcast double %add to <2 x i32>
   store <2 x i32> %bc, <2 x i32> addrspace(1)* %out, align 8
diff --git a/test/CodeGen/R600/bswap.ll b/test/CodeGen/R600/bswap.ll
index e93543de49da..4cf8e4bfed50 100644
--- a/test/CodeGen/R600/bswap.ll
+++ b/test/CodeGen/R600/bswap.ll
@@ -18,7 +18,7 @@ declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
 define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %val = load i32 addrspace(1)* %in, align 4
+  %val = load i32, i32 addrspace(1)* %in, align 4
   %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone
   store i32 %bswap, i32 addrspace(1)* %out, align 4
   ret void
@@ -33,7 +33,7 @@ define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
 ; SI-DAG: v_bfi_b32
 ; SI: s_endpgm
 define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
-  %val = load <2 x i32> addrspace(1)* %in, align 8
+  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
   %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone
   store <2 x i32> %bswap, <2 x i32> addrspace(1)* %out, align 8
   ret void
@@ -54,7 +54,7 @@ define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(
 ; SI-DAG: v_bfi_b32
 ; SI: s_endpgm
 define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind {
-  %val = load <4 x i32> addrspace(1)* %in, align 16
+  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
   %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone
   store <4 x i32> %bswap, <4 x i32> addrspace(1)* %out, align 16
   ret void
@@ -87,28 +87,28 @@ define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(
 ; SI-DAG: v_bfi_b32
 ; SI: s_endpgm
 define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind {
-  %val = load <8 x i32> addrspace(1)* %in, align 32
+  %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
   %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone
   store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32
   ret void
 }
 
 define void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
-  %val = load i64 addrspace(1)* %in, align 8
+  %val = load i64, i64 addrspace(1)* %in, align 8
   %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
   store i64 %bswap, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 define void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind {
-  %val = load <2 x i64> addrspace(1)* %in, align 16
+  %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
   %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone
   store <2 x i64> %bswap, <2 x i64> addrspace(1)* %out, align 16
   ret void
 }
 
 define void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind {
-  %val = load <4 x i64> addrspace(1)* %in, align 32
+  %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
   %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone
   store <4 x i64> %bswap, <4 x i64> addrspace(1)* %out, align 32
   ret void
diff --git a/test/CodeGen/R600/call.ll b/test/CodeGen/R600/call.ll
index 9a0eb1cc3fa0..e769fd11c282 100644
--- a/test/CodeGen/R600/call.ll
+++ b/test/CodeGen/R600/call.ll
@@ -8,9 +8,9 @@
 declare i32 @external_function(i32) nounwind
 
 define void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %a = load i32 addrspace(1)* %in
-  %b = load i32 addrspace(1)* %b_ptr
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %a = load i32, i32 addrspace(1)* %in
+  %b = load i32, i32 addrspace(1)* %b_ptr
   %c = call i32 @external_function(i32 %b) nounwind
   %result = add i32 %a, %c
   store i32 %result, i32 addrspace(1)* %out
@@ -23,9 +23,9 @@ define i32 @defined_function(i32 %x) nounwind noinline {
 }
 
 define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %a = load i32 addrspace(1)* %in
-  %b = load i32 addrspace(1)* %b_ptr
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %a = load i32, i32 addrspace(1)* %in
+  %b = load i32, i32 addrspace(1)* %b_ptr
   %c = call i32 @defined_function(i32 %b) nounwind
   %result = add i32 %a, %c
   store i32 %result, i32 addrspace(1)* %out
diff --git a/test/CodeGen/R600/call_fs.ll b/test/CodeGen/R600/call_fs.ll
index db2cb6e5011c..87bebbc49d52 100644
--- a/test/CodeGen/R600/call_fs.ll
+++ b/test/CodeGen/R600/call_fs.ll
@@ -2,11 +2,11 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood -show-mc-encoding -o - | FileCheck --check-prefix=EG %s
 ; RUN: llc < %s -march=r600 -mcpu=rv710 -show-mc-encoding -o - | FileCheck --check-prefix=R600 %s
 
-; EG: {{^}}call_fs:
 ; EG: .long 257
+; EG: {{^}}call_fs:
 ; EG: CALL_FS  ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0xc0,0x84]
-; R600: {{^}}call_fs:
 ; R600: .long 257
+; R600: {{^}}call_fs:
 ; R600:CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x89]
 
 
diff --git a/test/CodeGen/R600/coalescer_remat.ll b/test/CodeGen/R600/coalescer_remat.ll
new file mode 100644
index 000000000000..f78a77b36154
--- /dev/null
+++ b/test/CodeGen/R600/coalescer_remat.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs -o - %s | FileCheck %s
+target triple="amdgcn--"
+
+declare float @llvm.fma.f32(float, float, float)
+
+; This checks that rematerialization support of the coalescer does not
+; unnecessarily widen the register class. Without those fixes > 20 VGprs
+; are used here
+; Also check that some rematerialization of the 0 constant happened.
+; CHECK-LABEL: foobar
+; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
+; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
+; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
+; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
+; CHECK: ; NumVgprs: 12
+define void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
+entry:
+  %cmpflag = icmp eq i32 %flag, 1
+  br i1 %cmpflag, label %loop, label %exit
+
+loop:
+  %c = phi i32 [0, %entry], [%cnext, %loop]
+  %v0 = phi float [0.0, %entry], [%fma.0, %loop]
+  %v1 = phi float [0.0, %entry], [%fma.1, %loop]
+  %v2 = phi float [0.0, %entry], [%fma.2, %loop]
+  %v3 = phi float [0.0, %entry], [%fma.3, %loop]
+
+  ; Try to get the 0 constant to get coalesced into a wide register
+  %blup = insertelement <4 x float> undef, float %v0, i32 0
+  store <4 x float> %blup, <4 x float> addrspace(1)* %out
+
+  %load = load <4 x float>, <4 x float> addrspace(1)* %in
+  %load.0 = extractelement <4 x float> %load, i32 0
+  %load.1 = extractelement <4 x float> %load, i32 1
+  %load.2 = extractelement <4 x float> %load, i32 2
+  %load.3 = extractelement <4 x float> %load, i32 3
+  %fma.0 = call float @llvm.fma.f32(float %v0, float %load.0, float %v0)
+  %fma.1 = call float @llvm.fma.f32(float %v1, float %load.1, float %v1)
+  %fma.2 = call float @llvm.fma.f32(float %v2, float %load.2, float %v2)
+  %fma.3 = call float @llvm.fma.f32(float %v3, float %load.3, float %v3)
+
+  %cnext = add nsw i32 %c, 1
+  %cmp = icmp eq i32 %cnext, 42
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  %ev0 = phi float [0.0, %entry], [%fma.0, %loop]
+  %ev1 = phi float [0.0, %entry], [%fma.1, %loop]
+  %ev2 = phi float [0.0, %entry], [%fma.2, %loop]
+  %ev3 = phi float [0.0, %entry], [%fma.3, %loop]
+  %dst.0 = insertelement <4 x float> undef,  float %ev0, i32 0
+  %dst.1 = insertelement <4 x float> %dst.0, float %ev1, i32 1
+  %dst.2 = insertelement <4 x float> %dst.1, float %ev2, i32 2
+  %dst.3 = insertelement <4 x float> %dst.2, float %ev3, i32 3
+  store <4 x float> %dst.3, <4 x float> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll b/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll
index e16a397bb5a4..0aecc189e0bf 100644
--- a/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll
+++ b/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll
@@ -14,7 +14,7 @@ define void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) {
 entry:
   %0 = mul nsw i32 %a, 3
   %1 = sext i32 %0 to i64
-  %2 = getelementptr i8 addrspace(1)* %in, i64 %1
+  %2 = getelementptr i8, i8 addrspace(1)* %in, i64 %1
   store i8 %b, i8 addrspace(1)* %2
   ret void
 }
diff --git a/test/CodeGen/R600/combine_vloads.ll b/test/CodeGen/R600/combine_vloads.ll
index 38420b25cba9..01572afa6205 100644
--- a/test/CodeGen/R600/combine_vloads.ll
+++ b/test/CodeGen/R600/combine_vloads.ll
@@ -23,7 +23,7 @@ for.body:                                         ; preds = %for.body, %entry
   %i.01 = phi i32 [ 0, %entry ], [ %tmp19, %for.body ]
   %arrayidx_v4 = bitcast <8 x i8> addrspace(1)* %src to <32 x i8> addrspace(1)*
   %0 = bitcast <32 x i8> addrspace(1)* %arrayidx_v4 to <8 x i32> addrspace(1)*
-  %vecload2 = load <8 x i32> addrspace(1)* %0, align 32
+  %vecload2 = load <8 x i32>, <8 x i32> addrspace(1)* %0, align 32
   %1 = bitcast <8 x i32> %vecload2 to <32 x i8>
   %tmp5 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %tmp8 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -32,7 +32,7 @@ for.body:                                         ; preds = %for.body, %entry
   %tmp13 = add nsw <8 x i8> %tmp9, %tmp12
   %tmp16 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %tmp17 = add nsw <8 x i8> %tmp13, %tmp16
-  %scevgep = getelementptr <8 x i8> addrspace(1)* %result, i32 %i.01
+  %scevgep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %result, i32 %i.01
   %2 = bitcast <8 x i8> %tmp17 to <2 x i32>
   %3 = bitcast <8 x i8> addrspace(1)* %scevgep to <2 x i32> addrspace(1)*
   store <2 x i32> %2, <2 x i32> addrspace(1)* %3, align 8
diff --git a/test/CodeGen/R600/commute-compares.ll b/test/CodeGen/R600/commute-compares.ll
new file mode 100644
index 000000000000..31766047a358
--- /dev/null
+++ b/test/CodeGen/R600/commute-compares.ll
@@ -0,0 +1,697 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
+; --------------------------------------------------------------------------------
+; i32 compares
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}commute_eq_64_i32:
+; GCN: v_cmp_eq_i32_e32 vcc, 64, v{{[0-9]+}}
+define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i32, i32 addrspace(1)* %gep.in
+  %cmp = icmp eq i32 %val, 64
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_ne_64_i32:
+; GCN: v_cmp_ne_i32_e32 vcc, 64, v{{[0-9]+}}
+define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i32, i32 addrspace(1)* %gep.in
+  %cmp = icmp ne i32 %val, 64
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; FIXME: Why isn't this being folded as a constant?
+; GCN-LABEL: {{^}}commute_ne_litk_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039
+; GCN: v_cmp_ne_i32_e32 vcc, [[K]], v{{[0-9]+}}
+define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i32, i32 addrspace(1)* %gep.in
+  %cmp = icmp ne i32 %val, 12345
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_ugt_64_i32:
+; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}}
+define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i32, i32 addrspace(1)* %gep.in
+  %cmp = icmp ugt i32 %val, 64
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_uge_64_i32:
+; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}}
+define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i32, i32 addrspace(1)* %gep.in
+  %cmp = icmp uge i32 %val, 64
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_ult_64_i32:
+; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
+define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i32, i32 addrspace(1)* %gep.in
+  %cmp = icmp ult i32 %val, 64
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_ule_63_i32:
+; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
+define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i32, i32 addrspace(1)* %gep.in
+  %cmp = icmp ule i32 %val, 63
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm
+
+; GCN-LABEL: {{^}}commute_ule_64_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}}
+; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}}
+define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i32, i32 addrspace(1)* %gep.in
+  %cmp = icmp ule i32 %val, 64
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_sgt_neg1_i32:
+; GCN: v_cmp_lt_i32_e32 vcc, -1, v{{[0-9]+}}
+define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i32, i32 addrspace(1)* %gep.in
+  %cmp = icmp sgt i32 %val, -1
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_sge_neg2_i32:
+; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}}
+define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i32, i32 addrspace(1)* %gep.in
+  %cmp = icmp sge i32 %val, -2
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_slt_neg16_i32:
+; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}}
+define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i32, i32 addrspace(1)* %gep.in
+  %cmp = icmp slt i32 %val, -16
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_sle_5_i32:
+; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}}
+define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i32, i32 addrspace(1)* %gep.in
+  %cmp = icmp sle i32 %val, 5
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; --------------------------------------------------------------------------------
+; i64 compares
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}commute_eq_64_i64:
+; GCN: v_cmp_eq_i64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i64, i64 addrspace(1)* %gep.in
+  %cmp = icmp eq i64 %val, 64
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_ne_64_i64:
+; GCN: v_cmp_ne_i64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i64, i64 addrspace(1)* %gep.in
+  %cmp = icmp ne i64 %val, 64
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_ugt_64_i64:
+; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i64, i64 addrspace(1)* %gep.in
+  %cmp = icmp ugt i64 %val, 64
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_uge_64_i64:
+; GCN: v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i64, i64 addrspace(1)* %gep.in
+  %cmp = icmp uge i64 %val, 64
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_ult_64_i64:
+; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i64, i64 addrspace(1)* %gep.in
+  %cmp = icmp ult i64 %val, 64
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_ule_63_i64:
+; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i64, i64 addrspace(1)* %gep.in
+  %cmp = icmp ule i64 %val, 63
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm
+
+; GCN-LABEL: {{^}}commute_ule_64_i64:
+; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}}
+; GCN: v_cmp_gt_u64_e32 vcc, s{{\[}}[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i64, i64 addrspace(1)* %gep.in
+  %cmp = icmp ule i64 %val, 64
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_sgt_neg1_i64:
+; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i64, i64 addrspace(1)* %gep.in
+  %cmp = icmp sgt i64 %val, -1
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_sge_neg2_i64:
+; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i64, i64 addrspace(1)* %gep.in
+  %cmp = icmp sge i64 %val, -2
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_slt_neg16_i64:
+; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i64, i64 addrspace(1)* %gep.in
+  %cmp = icmp slt i64 %val, -16
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_sle_5_i64:
+; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load i64, i64 addrspace(1)* %gep.in
+  %cmp = icmp sle i64 %val, 5
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; --------------------------------------------------------------------------------
+; f32 compares
+; --------------------------------------------------------------------------------
+
+
+; GCN-LABEL: {{^}}commute_oeq_2.0_f32:
+; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load float, float addrspace(1)* %gep.in
+  %cmp = fcmp oeq float %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}commute_ogt_2.0_f32:
+; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load float, float addrspace(1)* %gep.in
+  %cmp = fcmp ogt float %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_oge_2.0_f32:
+; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load float, float addrspace(1)* %gep.in
+  %cmp = fcmp oge float %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_olt_2.0_f32:
+; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load float, float addrspace(1)* %gep.in
+  %cmp = fcmp olt float %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_ole_2.0_f32:
+; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load float, float addrspace(1)* %gep.in
+  %cmp = fcmp ole float %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_one_2.0_f32:
+; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load float, float addrspace(1)* %gep.in
+  %cmp = fcmp one float %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_ord_2.0_f32:
+; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
+define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load float, float addrspace(1)* %gep.in
+  %cmp = fcmp ord float %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_ueq_2.0_f32:
+; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load float, float addrspace(1)* %gep.in
+  %cmp = fcmp ueq float %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_ugt_2.0_f32:
+; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load float, float addrspace(1)* %gep.in
+  %cmp = fcmp ugt float %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_uge_2.0_f32:
+; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load float, float addrspace(1)* %gep.in
+  %cmp = fcmp uge float %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_ult_2.0_f32:
+; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load float, float addrspace(1)* %gep.in
+  %cmp = fcmp ult float %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_ule_2.0_f32:
+; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load float, float addrspace(1)* %gep.in
+  %cmp = fcmp ule float %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_une_2.0_f32:
+; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load float, float addrspace(1)* %gep.in
+  %cmp = fcmp une float %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_uno_2.0_f32:
+; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
+define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load float, float addrspace(1)* %gep.in
+  %cmp = fcmp uno float %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; --------------------------------------------------------------------------------
+; f64 compares
+; --------------------------------------------------------------------------------
+
+
+; GCN-LABEL: {{^}}commute_oeq_2.0_f64:
+; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load double, double addrspace(1)* %gep.in
+  %cmp = fcmp oeq double %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}commute_ogt_2.0_f64:
+; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load double, double addrspace(1)* %gep.in
+  %cmp = fcmp ogt double %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_oge_2.0_f64:
+; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load double, double addrspace(1)* %gep.in
+  %cmp = fcmp oge double %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_olt_2.0_f64:
+; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load double, double addrspace(1)* %gep.in
+  %cmp = fcmp olt double %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_ole_2.0_f64:
+; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load double, double addrspace(1)* %gep.in
+  %cmp = fcmp ole double %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_one_2.0_f64:
+; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load double, double addrspace(1)* %gep.in
+  %cmp = fcmp one double %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_ord_2.0_f64:
+; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
+define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load double, double addrspace(1)* %gep.in
+  %cmp = fcmp ord double %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_ueq_2.0_f64:
+; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load double, double addrspace(1)* %gep.in
+  %cmp = fcmp ueq double %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_ugt_2.0_f64:
+; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load double, double addrspace(1)* %gep.in
+  %cmp = fcmp ugt double %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_uge_2.0_f64:
+; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load double, double addrspace(1)* %gep.in
+  %cmp = fcmp uge double %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_ult_2.0_f64:
+; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load double, double addrspace(1)* %gep.in
+  %cmp = fcmp ult double %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_ule_2.0_f64:
+; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load double, double addrspace(1)* %gep.in
+  %cmp = fcmp ule double %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_une_2.0_f64:
+; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load double, double addrspace(1)* %gep.in
+  %cmp = fcmp une double %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_uno_2.0_f64:
+; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
+define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %val = load double, double addrspace(1)* %gep.in
+  %cmp = fcmp uno double %val, 2.0
+  %ext = sext i1 %cmp to i32
+  store i32 %ext, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/R600/commute_modifiers.ll b/test/CodeGen/R600/commute_modifiers.ll
index 6fddb6d595c9..7fc36eabb780 100644
--- a/test/CodeGen/R600/commute_modifiers.ll
+++ b/test/CodeGen/R600/commute_modifiers.ll
@@ -10,8 +10,8 @@ declare float @llvm.fma.f32(float, float, float) nounwind readnone
 ; SI-NEXT: buffer_store_dword [[REG]]
 define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %x = load float addrspace(1)* %gep.0
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %x = load float, float addrspace(1)* %gep.0
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %z = fadd float 2.0, %x.fabs
   store float %z, float addrspace(1)* %out
@@ -24,8 +24,8 @@ define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(
 ; SI-NEXT: buffer_store_dword [[REG]]
 define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %x = load float addrspace(1)* %gep.0
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %x = load float, float addrspace(1)* %gep.0
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %x.fneg.fabs = fsub float -0.000000e+00, %x.fabs
   %z = fmul float 4.0, %x.fneg.fabs
@@ -39,8 +39,8 @@ define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrs
 ; SI-NEXT: buffer_store_dword [[REG]]
 define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %x = load float addrspace(1)* %gep.0
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %x = load float, float addrspace(1)* %gep.0
   %x.fneg = fsub float -0.000000e+00, %x
   %z = fmul float 4.0, %x.fneg
   store float %z, float addrspace(1)* %out
@@ -55,8 +55,8 @@ define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(
 ; SI-NEXT: buffer_store_dword [[REG]]
 define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %x = load float addrspace(1)* %gep.0
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %x = load float, float addrspace(1)* %gep.0
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %z = fadd float 1024.0, %x.fabs
   store float %z, float addrspace(1)* %out
@@ -70,10 +70,10 @@ define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(
 ; SI-NEXT: buffer_store_dword [[REG]]
 define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
-  %x = load float addrspace(1)* %gep.0
-  %y = load float addrspace(1)* %gep.1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %x = load float, float addrspace(1)* %gep.0
+  %y = load float, float addrspace(1)* %gep.1
   %y.fabs = call float @llvm.fabs.f32(float %y) #1
   %z = fadd float %x, %y.fabs
   store float %z, float addrspace(1)* %out
@@ -87,10 +87,10 @@ define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)*
 ; SI-NEXT: buffer_store_dword [[REG]]
 define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
-  %x = load float addrspace(1)* %gep.0
-  %y = load float addrspace(1)* %gep.1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %x = load float, float addrspace(1)* %gep.0
+  %y = load float, float addrspace(1)* %gep.1
   %y.fneg = fsub float -0.000000e+00, %y
   %z = fmul float %x, %y.fneg
   store float %z, float addrspace(1)* %out
@@ -104,10 +104,10 @@ define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)*
 ; SI-NEXT: buffer_store_dword [[REG]]
 define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
-  %x = load float addrspace(1)* %gep.0
-  %y = load float addrspace(1)* %gep.1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %x = load float, float addrspace(1)* %gep.0
+  %y = load float, float addrspace(1)* %gep.1
   %y.fabs = call float @llvm.fabs.f32(float %y) #1
   %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
   %z = fmul float %x, %y.fabs.fneg
@@ -123,10 +123,10 @@ define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace
 ; SI-NEXT: buffer_store_dword [[REG]]
 define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
-  %x = load float addrspace(1)* %gep.0
-  %y = load float addrspace(1)* %gep.1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %x = load float, float addrspace(1)* %gep.0
+  %y = load float, float addrspace(1)* %gep.1
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %y.fabs = call float @llvm.fabs.f32(float %y) #1
   %z = fmul float %x.fabs, %y.fabs
@@ -141,10 +141,10 @@ define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrs
 ; SI-NEXT: buffer_store_dword [[REG]]
 define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
-  %x = load float addrspace(1)* %gep.0
-  %y = load float addrspace(1)* %gep.1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %x = load float, float addrspace(1)* %gep.0
+  %y = load float, float addrspace(1)* %gep.1
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %y.fabs = call float @llvm.fabs.f32(float %y) #1
   %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
@@ -163,12 +163,12 @@ define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float
 ; SI: buffer_store_dword [[RESULT]]
 define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
 
-  %r1 = load float addrspace(1)* %gep.0
-  %r2 = load float addrspace(1)* %gep.1
+  %r1 = load float, float addrspace(1)* %gep.0
+  %r2 = load float, float addrspace(1)* %gep.1
 
   %r2.fabs = call float @llvm.fabs.f32(float %r2)
 
diff --git a/test/CodeGen/R600/concat_vectors.ll b/test/CodeGen/R600/concat_vectors.ll
index b27bed3d4265..a09ed1f73857 100644
--- a/test/CodeGen/R600/concat_vectors.ll
+++ b/test/CodeGen/R600/concat_vectors.ll
@@ -288,7 +288,7 @@ define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <1
 ; SI: s_endpgm
 define void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
 bb:
-  %tmp = load <2 x float> addrspace(1)* %in, align 4
+  %tmp = load <2 x float>, <2 x float> addrspace(1)* %in, align 4
   %tmp1 = shufflevector <2 x float> %tmp, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %tmp2 = shufflevector <8 x float> undef, <8 x float> %tmp1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
   store <8 x float> %tmp2, <8 x float> addrspace(1)* %out, align 32
diff --git a/test/CodeGen/R600/copy-illegal-type.ll b/test/CodeGen/R600/copy-illegal-type.ll
index 56c43d23b4a1..8b397566066a 100644
--- a/test/CodeGen/R600/copy-illegal-type.ll
+++ b/test/CodeGen/R600/copy-illegal-type.ll
@@ -6,7 +6,7 @@
 ; SI: buffer_store_dword [[REG]]
 ; SI: s_endpgm
 define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
-  %val = load <4 x i8> addrspace(1)* %in, align 4
+  %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
   ret void
 }
@@ -17,7 +17,7 @@ define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)*
 ; SI: buffer_store_dword [[REG]]
 ; SI: s_endpgm
 define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
-  %val = load <4 x i8> addrspace(1)* %in, align 4
+  %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
   ret void
@@ -30,7 +30,7 @@ define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(
 ; SI: buffer_store_dword [[REG]]
 ; SI: s_endpgm
 define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
-  %val = load <4 x i8> addrspace(1)* %in, align 4
+  %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
@@ -45,7 +45,7 @@ define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(
 ; SI: buffer_store_dword [[REG]]
 ; SI: s_endpgm
 define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
-  %val = load <4 x i8> addrspace(1)* %in, align 4
+  %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
@@ -82,7 +82,7 @@ define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(
 
 ; SI: s_endpgm
 define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
-  %val = load <4 x i8> addrspace(1)* %in, align 4
+  %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
   store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
@@ -120,7 +120,7 @@ define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> add
 
 ; SI: s_endpgm
 define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
-  %val = load <4 x i8> addrspace(1)* %in, align 4
+  %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
   store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
@@ -133,7 +133,7 @@ define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8>
 ; SI-NOT: bfi
 ; SI: s_endpgm
 define void @test_copy_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
-  %val = load <3 x i8> addrspace(1)* %in, align 4
+  %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
   store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
   ret void
 }
@@ -145,7 +145,7 @@ define void @test_copy_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)*
 ; SI: buffer_load_ubyte
 ; SI: s_endpgm
 define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
-  %val = load volatile <4 x i8> addrspace(1)* %in, align 4
+  %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
   ret void
 }
@@ -161,7 +161,7 @@ define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8>
 ; SI: buffer_store_byte
 ; SI: s_endpgm
 define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
-  %val = load <4 x i8> addrspace(1)* %in, align 4
+  %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
   ret void
 }
diff --git a/test/CodeGen/R600/copy-to-reg.ll b/test/CodeGen/R600/copy-to-reg.ll
index 9c1de73b3b1b..fc875f6ef7a3 100644
--- a/test/CodeGen/R600/copy-to-reg.ll
+++ b/test/CodeGen/R600/copy-to-reg.ll
@@ -13,15 +13,15 @@ entry:
 
 loop:
   %inc = phi i32 [0, %entry], [%inc.i, %loop]
-  %ptr = getelementptr [16 x i32]* %alloca, i32 0, i32 %inc
+  %ptr = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %inc
   store i32 %inc, i32* %ptr
   %inc.i = add i32 %inc, 1
   %cnd = icmp uge i32 %inc.i, 16
   br i1 %cnd, label %done, label %loop
 
 done:
-  %tmp0 = getelementptr [16 x i32]* %alloca, i32 0, i32 0
-  %tmp1 = load i32* %tmp0
+  %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 0
+  %tmp1 = load i32, i32* %tmp0
   store i32 %tmp1, i32 addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/R600/ctlz_zero_undef.ll b/test/CodeGen/R600/ctlz_zero_undef.ll
index 1a4317b8095c..bd26c302fe5a 100644
--- a/test/CodeGen/R600/ctlz_zero_undef.ll
+++ b/test/CodeGen/R600/ctlz_zero_undef.ll
@@ -28,7 +28,7 @@ define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nou
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
-  %val = load i32 addrspace(1)* %valptr, align 4
+  %val = load i32, i32 addrspace(1)* %valptr, align 4
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   store i32 %ctlz, i32 addrspace(1)* %out, align 4
   ret void
@@ -44,7 +44,7 @@ define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
-  %val = load <2 x i32> addrspace(1)* %valptr, align 8
+  %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
   %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
   store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
   ret void
@@ -64,7 +64,7 @@ define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
-  %val = load <4 x i32> addrspace(1)* %valptr, align 16
+  %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
   %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
   store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
   ret void
diff --git a/test/CodeGen/R600/ctpop.ll b/test/CodeGen/R600/ctpop.ll
index c64f443ad697..0a031c5e24d1 100644
--- a/test/CodeGen/R600/ctpop.ll
+++ b/test/CodeGen/R600/ctpop.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=VI %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.ctpop.i32(i32) nounwind readnone
@@ -8,11 +9,11 @@ declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone
 declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone
 
 ; FUNC-LABEL: {{^}}s_ctpop_i32:
-; SI: s_load_dword [[SVAL:s[0-9]+]],
-; SI: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]]
-; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: buffer_store_dword [[VRESULT]],
-; SI: s_endpgm
+; GCN: s_load_dword [[SVAL:s[0-9]+]],
+; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]]
+; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; GCN: buffer_store_dword [[VRESULT]],
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
@@ -23,32 +24,33 @@ define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
 
 ; XXX - Why 0 in register?
 ; FUNC-LABEL: {{^}}v_ctpop_i32:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
-; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 0
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 0
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
-  %val = load i32 addrspace(1)* %in, align 4
+  %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   store i32 %ctpop, i32 addrspace(1)* %out, align 4
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
-; SI: buffer_load_dword [[VAL0:v[0-9]+]],
-; SI: buffer_load_dword [[VAL1:v[0-9]+]],
-; SI: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
-; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; GCN: buffer_load_dword [[VAL1:v[0-9]+]],
+; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
+; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
+; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
+; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
 define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
-  %val0 = load i32 addrspace(1)* %in0, align 4
-  %val1 = load i32 addrspace(1)* %in1, align 4
+  %val0 = load i32, i32 addrspace(1)* %in0, align 4
+  %val1 = load i32, i32 addrspace(1)* %in1, align 4
   %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
   %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone
   %add = add i32 %ctpop0, %ctpop1
@@ -57,13 +59,13 @@ define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32:
-; SI: buffer_load_dword [[VAL0:v[0-9]+]],
-; SI-NEXT: s_waitcnt
-; SI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
-; SI-NEXT: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
+; GCN-NEXT: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
-  %val0 = load i32 addrspace(1)* %in0, align 4
+  %val0 = load i32, i32 addrspace(1)* %in0, align 4
   %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
   %add = add i32 %ctpop0, %sval
   store i32 %add, i32 addrspace(1)* %out, align 4
@@ -71,47 +73,47 @@ define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_v2i32:
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: s_endpgm
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
 define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
-  %val = load <2 x i32> addrspace(1)* %in, align 8
+  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
   %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone
   store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_v4i32:
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: s_endpgm
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
 ; EG: BCNT_INT
 ; EG: BCNT_INT
 define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
-  %val = load <4 x i32> addrspace(1)* %in, align 16
+  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
   %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone
   store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_v8i32:
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: s_endpgm
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
@@ -122,30 +124,30 @@ define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrs
 ; EG: BCNT_INT
 ; EG: BCNT_INT
 define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
-  %val = load <8 x i32> addrspace(1)* %in, align 32
+  %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
   %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone
   store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_v16i32:
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: v_bcnt_u32_b32_e64
-; SI: s_endpgm
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
@@ -164,21 +166,21 @@ define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrs
 ; EG: BCNT_INT
 ; EG: BCNT_INT
 define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
-  %val = load <16 x i32> addrspace(1)* %in, align 32
+  %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32
   %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone
   store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
-; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
-  %val = load i32 addrspace(1)* %in, align 4
+  %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 %ctpop, 4
   store i32 %add, i32 addrspace(1)* %out, align 4
@@ -186,14 +188,14 @@ define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
-; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
-  %val = load i32 addrspace(1)* %in, align 4
+  %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 4, %ctpop
   store i32 %add, i32 addrspace(1)* %out, align 4
@@ -201,13 +203,14 @@ define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out,
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
-; SI: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
+; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
-  %val = load i32 addrspace(1)* %in, align 4
+  %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 %ctpop, 99999
   store i32 %add, i32 addrspace(1)* %out, align 4
@@ -215,15 +218,15 @@ define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspa
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_var:
-; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]],
-; SI-DAG: s_load_dword [[VAR:s[0-9]+]],
-; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
+; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
-  %val = load i32 addrspace(1)* %in, align 4
+  %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 %ctpop, %const
   store i32 %add, i32 addrspace(1)* %out, align 4
@@ -231,15 +234,15 @@ define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv:
-; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]],
-; SI-DAG: s_load_dword [[VAR:s[0-9]+]],
-; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
+; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
-  %val = load i32 addrspace(1)* %in, align 4
+  %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 %const, %ctpop
   store i32 %add, i32 addrspace(1)* %out, align 4
@@ -247,18 +250,19 @@ define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspa
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv:
-; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], {{0$}}
-; SI-DAG: buffer_load_dword [[VAR:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:16
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], {{0$}}
+; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:16
 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
-  %val = load i32 addrspace(1)* %in, align 4
+  %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
-  %gep = getelementptr i32 addrspace(1)* %constptr, i32 4
-  %const = load i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4
+  %const = load i32, i32 addrspace(1)* %gep, align 4
   %add = add i32 %const, %ctpop
   store i32 %add, i32 addrspace(1)* %out, align 4
   ret void
@@ -269,10 +273,11 @@ define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrsp
 
 ; FUNC-LABEL: {{^}}ctpop_i32_in_br:
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd
-; SI: s_bcnt1_i32_b32  [[SRESULT:s[0-9]+]], [[VAL]]
-; SI: v_mov_b32_e32 [[RESULT]], [[SRESULT]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34
+; GCN: s_bcnt1_i32_b32  [[SRESULT:s[0-9]+]], [[VAL]]
+; GCN: v_mov_b32_e32 [[RESULT]], [[SRESULT]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 ; EG: BCNT_INT
 define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
 entry:
@@ -284,8 +289,8 @@ if:
   br label %endif
 
 else:
-  %tmp3 = getelementptr i32 addrspace(1)* %in, i32 1
-  %tmp4 = load i32 addrspace(1)* %tmp3
+  %tmp3 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %tmp4 = load i32, i32 addrspace(1)* %tmp3
   br label %endif
 
 endif:
diff --git a/test/CodeGen/R600/ctpop64.ll b/test/CodeGen/R600/ctpop64.ll
index 9758ac96ea9b..e1a0ee3ea217 100644
--- a/test/CodeGen/R600/ctpop64.ll
+++ b/test/CodeGen/R600/ctpop64.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 
 declare i64 @llvm.ctpop.i64(i64) nounwind readnone
 declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
@@ -8,10 +9,11 @@ declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone
 
 ; FUNC-LABEL: {{^}}s_ctpop_i64:
 ; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_bcnt1_i32_b64 [[SRESULT:s[0-9]+]], [[SVAL]]
-; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: buffer_store_dword [[VRESULT]],
-; SI: s_endpgm
+; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN: s_bcnt1_i32_b64 [[SRESULT:s[0-9]+]], [[SVAL]]
+; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; GCN: buffer_store_dword [[VRESULT]],
+; GCN: s_endpgm
 define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
   %truncctpop = trunc i64 %ctpop to i32
@@ -20,13 +22,14 @@ define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_i64:
-; SI: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
-; SI: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
+; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
+; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
 ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
-  %val = load i64 addrspace(1)* %in, align 8
+  %val = load i64, i64 addrspace(1)* %in, align 8
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
   %truncctpop = trunc i64 %ctpop to i32
   store i32 %truncctpop, i32 addrspace(1)* %out, align 4
@@ -34,9 +37,9 @@ define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noali
 }
 
 ; FUNC-LABEL: {{^}}s_ctpop_v2i64:
-; SI: s_bcnt1_i32_b64
-; SI: s_bcnt1_i32_b64
-; SI: s_endpgm
+; GCN: s_bcnt1_i32_b64
+; GCN: s_bcnt1_i32_b64
+; GCN: s_endpgm
 define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind {
   %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
   %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
@@ -45,11 +48,11 @@ define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val)
 }
 
 ; FUNC-LABEL: {{^}}s_ctpop_v4i64:
-; SI: s_bcnt1_i32_b64
-; SI: s_bcnt1_i32_b64
-; SI: s_bcnt1_i32_b64
-; SI: s_bcnt1_i32_b64
-; SI: s_endpgm
+; GCN: s_bcnt1_i32_b64
+; GCN: s_bcnt1_i32_b64
+; GCN: s_bcnt1_i32_b64
+; GCN: s_bcnt1_i32_b64
+; GCN: s_endpgm
 define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind {
   %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
   %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
@@ -58,13 +61,13 @@ define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val)
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_v2i64:
-; SI: v_bcnt_u32_b32
-; SI: v_bcnt_u32_b32
-; SI: v_bcnt_u32_b32
-; SI: v_bcnt_u32_b32
-; SI: s_endpgm
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: s_endpgm
 define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
-  %val = load <2 x i64> addrspace(1)* %in, align 16
+  %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
   %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
   %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
   store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
@@ -72,17 +75,17 @@ define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrs
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_v4i64:
-; SI: v_bcnt_u32_b32
-; SI: v_bcnt_u32_b32
-; SI: v_bcnt_u32_b32
-; SI: v_bcnt_u32_b32
-; SI: v_bcnt_u32_b32
-; SI: v_bcnt_u32_b32
-; SI: v_bcnt_u32_b32
-; SI: v_bcnt_u32_b32
-; SI: s_endpgm
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: s_endpgm
 define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
-  %val = load <4 x i64> addrspace(1)* %in, align 32
+  %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
   %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
   %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
   store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
@@ -94,11 +97,12 @@ define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrs
 
 ; FUNC-LABEL: {{^}}ctpop_i64_in_br:
 ; SI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd
-; SI: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}}
-; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]]
-; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]]
-; SI: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}}
-; SI: s_endpgm
+; VI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34
+; GCN: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}}
+; GCN: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]]
+; GCN: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]]
+; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}}
+; GCN: s_endpgm
 define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {
 entry:
   %tmp0 = icmp eq i32 %cond, 0
@@ -109,8 +113,8 @@ if:
   br label %endif
 
 else:
-  %tmp3 = getelementptr i64 addrspace(1)* %in, i32 1
-  %tmp4 = load i64 addrspace(1)* %tmp3
+  %tmp3 = getelementptr i64, i64 addrspace(1)* %in, i32 1
+  %tmp4 = load i64, i64 addrspace(1)* %tmp3
   br label %endif
 
 endif:
diff --git a/test/CodeGen/R600/cttz-ctlz.ll b/test/CodeGen/R600/cttz-ctlz.ll
deleted file mode 100644
index c957a033c5d7..000000000000
--- a/test/CodeGen/R600/cttz-ctlz.ll
+++ /dev/null
@@ -1,225 +0,0 @@
-; RUN: opt -S -codegenprepare -mtriple=r600-unknown-unknown -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=ALL %s
-; RUN: opt -S -codegenprepare -mtriple=r600-unknown-unknown -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=ALL %s
-
-
-define i64 @test1(i64 %A) {
-; ALL-LABEL: @test1(
-; SI: [[CTLZ:%[A-Za-z0-9]+]] = call i64 @llvm.ctlz.i64(i64 %A, i1 false)
-; SI-NEXT: ret i64 [[CTLZ]]
-entry:
-  %tobool = icmp eq i64 %A, 0
-  br i1 %tobool, label %cond.end, label %cond.true
-
-cond.true:                                        ; preds = %entry
-  %0 = tail call i64 @llvm.ctlz.i64(i64 %A, i1 true)
-  br label %cond.end
-
-cond.end:                                         ; preds = %entry, %cond.true
-  %cond = phi i64 [ %0, %cond.true ], [ 64, %entry ]
-  ret i64 %cond
-}
-
-
-define i32 @test2(i32 %A) {
-; ALL-LABEL: @test2(
-; SI: [[CTLZ:%[A-Za-z0-9]+]] = call i32 @llvm.ctlz.i32(i32 %A, i1 false)
-; SI-NEXT: ret i32 [[CTLZ]]
-entry:
-  %tobool = icmp eq i32 %A, 0
-  br i1 %tobool, label %cond.end, label %cond.true
-
-cond.true:                                        ; preds = %entry
-  %0 = tail call i32 @llvm.ctlz.i32(i32 %A, i1 true)
-  br label %cond.end
-
-cond.end:                                         ; preds = %entry, %cond.true
-  %cond = phi i32 [ %0, %cond.true ], [ 32, %entry ]
-  ret i32 %cond
-}
-
-
-define signext i16 @test3(i16 signext %A) {
-; ALL-LABEL: @test3(
-; SI: [[CTLZ:%[A-Za-z0-9]+]] = call i16 @llvm.ctlz.i16(i16 %A, i1 false)
-; SI-NEXT: ret i16 [[CTLZ]]
-entry:
-  %tobool = icmp eq i16 %A, 0
-  br i1 %tobool, label %cond.end, label %cond.true
-
-cond.true:                                        ; preds = %entry
-  %0 = tail call i16 @llvm.ctlz.i16(i16 %A, i1 true)
-  br label %cond.end
-
-cond.end:                                         ; preds = %entry, %cond.true
-  %cond = phi i16 [ %0, %cond.true ], [ 16, %entry ]
-  ret i16 %cond
-}
-
-
-define i64 @test1b(i64 %A) {
-; ALL-LABEL: @test1b(
-; SI: [[CTTZ:%[A-Za-z0-9]+]] = call i64 @llvm.cttz.i64(i64 %A, i1 false)
-; SI-NEXT: ret i64 [[CTTZ]]
-entry:
-  %tobool = icmp eq i64 %A, 0
-  br i1 %tobool, label %cond.end, label %cond.true
-
-cond.true:                                        ; preds = %entry
-  %0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)
-  br label %cond.end
-
-cond.end:                                         ; preds = %entry, %cond.true
-  %cond = phi i64 [ %0, %cond.true ], [ 64, %entry ]
-  ret i64 %cond
-}
-
-
-define i32 @test2b(i32 %A) {
-; ALL-LABEL: @test2b(
-; SI: [[CTTZ:%[A-Za-z0-9]+]] = call i32 @llvm.cttz.i32(i32 %A, i1 false)
-; SI-NEXT: ret i32 [[CTTZ]]
-entry:
-  %tobool = icmp eq i32 %A, 0
-  br i1 %tobool, label %cond.end, label %cond.true
-
-cond.true:                                        ; preds = %entry
-  %0 = tail call i32 @llvm.cttz.i32(i32 %A, i1 true)
-  br label %cond.end
-
-cond.end:                                         ; preds = %entry, %cond.true
-  %cond = phi i32 [ %0, %cond.true ], [ 32, %entry ]
-  ret i32 %cond
-}
-
-
-define signext i16 @test3b(i16 signext %A) {
-; ALL-LABEL: @test3b(
-; SI: [[CTTZ:%[A-Za-z0-9]+]] = call i16 @llvm.cttz.i16(i16 %A, i1 false)
-; SI-NEXT: ret i16 [[CTTZ]]
-entry:
-  %tobool = icmp eq i16 %A, 0
-  br i1 %tobool, label %cond.end, label %cond.true
-
-cond.true:                                        ; preds = %entry
-  %0 = tail call i16 @llvm.cttz.i16(i16 %A, i1 true)
-  br label %cond.end
-
-cond.end:                                         ; preds = %entry, %cond.true
-  %cond = phi i16 [ %0, %cond.true ], [ 16, %entry ]
-  ret i16 %cond
-}
-
-
-define i64 @test1c(i64 %A) {
-; ALL-LABEL: @test1c(
-; ALL: icmp eq i64 %A, 0
-; ALL: call i64 @llvm.ctlz.i64(i64 %A, i1 true)
-entry:
-  %tobool = icmp eq i64 %A, 0
-  br i1 %tobool, label %cond.end, label %cond.true
-
-cond.true:                                        ; preds = %entry
-  %0 = tail call i64 @llvm.ctlz.i64(i64 %A, i1 true)
-  br label %cond.end
-
-cond.end:                                         ; preds = %entry, %cond.true
-  %cond = phi i64 [ %0, %cond.true ], [ 63, %entry ]
-  ret i64 %cond
-}
-
-define i32 @test2c(i32 %A) {
-; ALL-LABEL: @test2c(
-; ALL: icmp eq i32 %A, 0
-; ALL: call i32 @llvm.ctlz.i32(i32 %A, i1 true)
-entry:
-  %tobool = icmp eq i32 %A, 0
-  br i1 %tobool, label %cond.end, label %cond.true
-
-cond.true:                                        ; preds = %entry
-  %0 = tail call i32 @llvm.ctlz.i32(i32 %A, i1 true)
-  br label %cond.end
-
-cond.end:                                         ; preds = %entry, %cond.true
-  %cond = phi i32 [ %0, %cond.true ], [ 31, %entry ]
-  ret i32 %cond
-}
-
-
-define signext i16 @test3c(i16 signext %A) {
-; ALL-LABEL: @test3c(
-; ALL: icmp eq i16 %A, 0
-; ALL: call i16 @llvm.ctlz.i16(i16 %A, i1 true)
-entry:
-  %tobool = icmp eq i16 %A, 0
-  br i1 %tobool, label %cond.end, label %cond.true
-
-cond.true:                                        ; preds = %entry
-  %0 = tail call i16 @llvm.ctlz.i16(i16 %A, i1 true)
-  br label %cond.end
-
-cond.end:                                         ; preds = %entry, %cond.true
-  %cond = phi i16 [ %0, %cond.true ], [ 15, %entry ]
-  ret i16 %cond
-}
-
-
-define i64 @test1d(i64 %A) {
-; ALL-LABEL: @test1d(
-; ALL: icmp eq i64 %A, 0
-; ALL: call i64 @llvm.cttz.i64(i64 %A, i1 true)
-entry:
-  %tobool = icmp eq i64 %A, 0
-  br i1 %tobool, label %cond.end, label %cond.true
-
-cond.true:                                        ; preds = %entry
-  %0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)
-  br label %cond.end
-
-cond.end:                                         ; preds = %entry, %cond.true
-  %cond = phi i64 [ %0, %cond.true ], [ 63, %entry ]
-  ret i64 %cond
-}
-
-
-define i32 @test2d(i32 %A) {
-; ALL-LABEL: @test2d(
-; ALL: icmp eq i32 %A, 0
-; ALL: call i32 @llvm.cttz.i32(i32 %A, i1 true)
-entry:
-  %tobool = icmp eq i32 %A, 0
-  br i1 %tobool, label %cond.end, label %cond.true
-
-cond.true:                                        ; preds = %entry
-  %0 = tail call i32 @llvm.cttz.i32(i32 %A, i1 true)
-  br label %cond.end
-
-cond.end:                                         ; preds = %entry, %cond.true
-  %cond = phi i32 [ %0, %cond.true ], [ 31, %entry ]
-  ret i32 %cond
-}
-
-
-define signext i16 @test3d(i16 signext %A) {
-; ALL-LABEL: @test3d(
-; ALL: icmp eq i16 %A, 0
-; ALL: call i16 @llvm.cttz.i16(i16 %A, i1 true)
-entry:
-  %tobool = icmp eq i16 %A, 0
-  br i1 %tobool, label %cond.end, label %cond.true
-
-cond.true:                                        ; preds = %entry
-  %0 = tail call i16 @llvm.cttz.i16(i16 %A, i1 true)
-  br label %cond.end
-
-cond.end:                                         ; preds = %entry, %cond.true
-  %cond = phi i16 [ %0, %cond.true ], [ 15, %entry ]
-  ret i16 %cond
-}
-
-
-declare i64 @llvm.ctlz.i64(i64, i1)
-declare i32 @llvm.ctlz.i32(i32, i1)
-declare i16 @llvm.ctlz.i16(i16, i1)
-declare i64 @llvm.cttz.i64(i64, i1)
-declare i32 @llvm.cttz.i32(i32, i1)
-declare i16 @llvm.cttz.i16(i16, i1)
diff --git a/test/CodeGen/R600/cttz_zero_undef.ll b/test/CodeGen/R600/cttz_zero_undef.ll
index d9d284c58865..56fcb51fe14e 100644
--- a/test/CodeGen/R600/cttz_zero_undef.ll
+++ b/test/CodeGen/R600/cttz_zero_undef.ll
@@ -28,7 +28,7 @@ define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nou
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
 define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
-  %val = load i32 addrspace(1)* %valptr, align 4
+  %val = load i32, i32 addrspace(1)* %valptr, align 4
   %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
   store i32 %cttz, i32 addrspace(1)* %out, align 4
   ret void
@@ -44,7 +44,7 @@ define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
 define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
-  %val = load <2 x i32> addrspace(1)* %valptr, align 8
+  %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
   %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
   store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
   ret void
@@ -64,7 +64,7 @@ define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
 define void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
-  %val = load <4 x i32> addrspace(1)* %valptr, align 16
+  %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
   %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
   store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
   ret void
diff --git a/test/CodeGen/R600/cvt_f32_ubyte.ll b/test/CodeGen/R600/cvt_f32_ubyte.ll
index 69eea5919c05..3399d9da29e3 100644
--- a/test/CodeGen/R600/cvt_f32_ubyte.ll
+++ b/test/CodeGen/R600/cvt_f32_ubyte.ll
@@ -8,7 +8,7 @@
 ; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
 ; SI: buffer_store_dword [[CONV]],
 define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
-  %load = load i8 addrspace(1)* %in, align 1
+  %load = load i8, i8 addrspace(1)* %in, align 1
   %cvt = uitofp i8 %load to float
   store float %cvt, float addrspace(1)* %out, align 4
   ret void
@@ -23,7 +23,7 @@ define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* n
 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
 ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
 define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <2 x i8> addrspace(1)* %in, align 2
+  %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2
   %cvt = uitofp <2 x i8> %load to <2 x float>
   store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
   ret void
@@ -37,7 +37,7 @@ define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8>
 ; SI-DAG: v_cvt_f32_ubyte0_e32
 ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
 define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <3 x i8> addrspace(1)* %in, align 4
+  %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
   %cvt = uitofp <3 x i8> %load to <3 x float>
   store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
   ret void
@@ -53,7 +53,7 @@ define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8>
 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
 ; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
 define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <4 x i8> addrspace(1)* %in, align 4
+  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   %cvt = uitofp <4 x i8> %load to <4 x float>
   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
   ret void
@@ -63,10 +63,10 @@ define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8>
 ; position in the word for the component.
 
 ; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
-; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
-; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
-; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
 ; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
+; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
+; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
+; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
 ; SI-NOT: v_lshlrev_b32
 ; SI-NOT: v_or_b32
 
@@ -77,7 +77,7 @@ define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8>
 
 ; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
 define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <4 x i8> addrspace(1)* %in, align 1
+  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
   %cvt = uitofp <4 x i8> %load to <4 x float>
   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
   ret void
@@ -105,7 +105,7 @@ define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out
 ; XSI: v_cvt_f32_u32_e32
 ; SI: s_endpgm
 define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <4 x i8> addrspace(1)* %in, align 4
+  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   %cvt = uitofp <4 x i8> %load to <4 x float>
   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
   %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
@@ -117,7 +117,7 @@ define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <
 ; SI-LABEL: {{^}}load_v7i8_to_v7f32:
 ; SI: s_endpgm
 define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <7 x i8> addrspace(1)* %in, align 1
+  %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1
   %cvt = uitofp <7 x i8> %load to <7 x float>
   store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
   ret void
@@ -146,7 +146,7 @@ define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8>
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
 define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <8 x i8> addrspace(1)* %in, align 8
+  %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
   %cvt = uitofp <8 x i8> %load to <8 x float>
   store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
   ret void
@@ -158,7 +158,7 @@ define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8>
 ; SI-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
 ; SI: buffer_store_dword [[CONV]],
 define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
-  %load = load i32 addrspace(1)* %in, align 4
+  %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 2
   %inreg = and i32 %add, 255
   %cvt = uitofp i32 %inreg to float
@@ -168,7 +168,7 @@ define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addr
 
 ; SI-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
 define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
-  %load = load i32 addrspace(1)* %in, align 4
+  %load = load i32, i32 addrspace(1)* %in, align 4
   %inreg = and i32 %load, 65280
   %shr = lshr i32 %inreg, 8
   %cvt = uitofp i32 %shr to float
@@ -180,7 +180,7 @@ define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addr
 ; We don't get these ones because of the zext, but instcombine removes
 ; them so it shouldn't really matter.
 define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
-  %load = load i8 addrspace(1)* %in, align 1
+  %load = load i8, i8 addrspace(1)* %in, align 1
   %ext = zext i8 %load to i32
   %cvt = uitofp i32 %ext to float
   store float %cvt, float addrspace(1)* %out, align 4
@@ -188,7 +188,7 @@ define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1
 }
 
 define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <4 x i8> addrspace(1)* %in, align 1
+  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
   %ext = zext <4 x i8> %load to <4 x i32>
   %cvt = uitofp <4 x i32> %ext to <4 x float>
   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
diff --git a/test/CodeGen/R600/cvt_flr_i32_f32.ll b/test/CodeGen/R600/cvt_flr_i32_f32.ll
new file mode 100644
index 000000000000..2dd3a9f2a776
--- /dev/null
+++ b/test/CodeGen/R600/cvt_flr_i32_f32.ll
@@ -0,0 +1,86 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.floor.f32(float) #1
+
+; FUNC-LABEL: {{^}}cvt_flr_i32_f32_0:
+; SI-SAFE-NOT: v_cvt_flr_i32_f32
+; SI-NOT: add
+; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; SI: s_endpgm
+define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+  %floor = call float @llvm.floor.f32(float %x) #1
+  %cvt = fptosi float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}cvt_flr_i32_f32_1:
+; SI: v_add_f32_e64 [[TMP:v[0-9]+]], 1.0, s{{[0-9]+}}
+; SI-SAFE-NOT: v_cvt_flr_i32_f32
+; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, [[TMP]]
+; SI: s_endpgm
+define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 {
+  %fadd = fadd float %x, 1.0
+  %floor = call float @llvm.floor.f32(float %fadd) #1
+  %cvt = fptosi float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fabs:
+; SI-NOT: add
+; SI-SAFE-NOT: v_cvt_flr_i32_f32
+; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|
+; SI: s_endpgm
+define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
+  %x.fabs = call float @llvm.fabs.f32(float %x) #1
+  %floor = call float @llvm.floor.f32(float %x.fabs) #1
+  %cvt = fptosi float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fneg:
+; SI-NOT: add
+; SI-SAFE-NOT: v_cvt_flr_i32_f32
+; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}
+; SI: s_endpgm
+define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
+  %x.fneg = fsub float -0.000000e+00, %x
+  %floor = call float @llvm.floor.f32(float %x.fneg) #1
+  %cvt = fptosi float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fabs_fneg:
+; SI-NOT: add
+; SI-SAFE-NOT: v_cvt_flr_i32_f32
+; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}|
+; SI: s_endpgm
+define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
+  %x.fabs = call float @llvm.fabs.f32(float %x) #1
+  %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs
+  %floor = call float @llvm.floor.f32(float %x.fabs.fneg) #1
+  %cvt = fptosi float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}no_cvt_flr_i32_f32_0:
+; SI-NOT: v_cvt_flr_i32_f32
+; SI: v_floor_f32
+; SI: v_cvt_u32_f32_e32
+; SI: s_endpgm
+define void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+  %floor = call float @llvm.floor.f32(float %x) #1
+  %cvt = fptoui float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/cvt_rpi_i32_f32.ll b/test/CodeGen/R600/cvt_rpi_i32_f32.ll
new file mode 100644
index 000000000000..864ac40260b3
--- /dev/null
+++ b/test/CodeGen/R600/cvt_rpi_i32_f32.ll
@@ -0,0 +1,83 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.floor.f32(float) #1
+
+; FUNC-LABEL: {{^}}cvt_rpi_i32_f32:
+; SI-SAFE-NOT: v_cvt_rpi_i32_f32
+; SI-NONAN: v_cvt_rpi_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; SI: s_endpgm
+define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 {
+  %fadd = fadd float %x, 0.5
+  %floor = call float @llvm.floor.f32(float %fadd) #1
+  %cvt = fptosi float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fabs:
+; SI-SAFE-NOT: v_cvt_rpi_i32_f32
+; SI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}}
+; SI: s_endpgm
+define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
+  %x.fabs = call float @llvm.fabs.f32(float %x) #1
+  %fadd = fadd float %x.fabs, 0.5
+  %floor = call float @llvm.floor.f32(float %fadd) #1
+  %cvt = fptosi float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: This doesn't work because it forms fsub 0.5, x
+; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fneg:
+; XSI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}
+; SI: v_sub_f32_e64 [[TMP:v[0-9]+]], 0.5, s{{[0-9]+}}
+; SI-SAFE-NOT: v_cvt_flr_i32_f32
+; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]]
+; SI: s_endpgm
+define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
+  %x.fneg = fsub float -0.000000e+00, %x
+  %fadd = fadd float %x.fneg, 0.5
+  %floor = call float @llvm.floor.f32(float %fadd) #1
+  %cvt = fptosi float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: This doesn't work for same reason as above
+; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fabs_fneg:
+; SI-SAFE-NOT: v_cvt_rpi_i32_f32
+; XSI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}|
+
+; SI: v_sub_f32_e64 [[TMP:v[0-9]+]], 0.5, |s{{[0-9]+}}|
+; SI-SAFE-NOT: v_cvt_flr_i32_f32
+; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]]
+; SI: s_endpgm
+define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
+  %x.fabs = call float @llvm.fabs.f32(float %x) #1
+  %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs
+  %fadd = fadd float %x.fabs.fneg, 0.5
+  %floor = call float @llvm.floor.f32(float %fadd) #1
+  %cvt = fptosi float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}no_cvt_rpi_i32_f32_0:
+; SI-NOT: v_cvt_rpi_i32_f32
+; SI: v_add_f32
+; SI: v_floor_f32
+; SI: v_cvt_u32_f32
+; SI: s_endpgm
+define void @no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+  %fadd = fadd float %x, 0.5
+  %floor = call float @llvm.floor.f32(float %fadd) #1
+  %cvt = fptoui float %floor to i32
+  store i32 %cvt, i32 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll b/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll
index 1e47bfa0c779..fb43ff4fbddd 100644
--- a/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll
+++ b/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll
@@ -12,8 +12,8 @@
 
 define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %sint = load i32 addrspace(1) * %in
+  %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %sint = load i32, i32 addrspace(1) * %in
   %conv = sitofp i32 %sint to float
   %0 = insertelement <4 x float> undef, float %conv, i32 0
   %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer
@@ -26,8 +26,8 @@ entry:
 
 define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %uint = load i32 addrspace(1) * %in
+  %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %uint = load i32, i32 addrspace(1) * %in
   %conv = uitofp i32 %uint to float
   %0 = insertelement <4 x float> undef, float %conv, i32 0
   %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer
diff --git a/test/CodeGen/R600/debug.ll b/test/CodeGen/R600/debug.ll
new file mode 100644
index 000000000000..a2e0e878b740
--- /dev/null
+++ b/test/CodeGen/R600/debug.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs -mattr=dumpcode -filetype=obj | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=dumpcode -filetype=obj | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+
+; Test for a crash in the custom assembly dump code.
+
+; SI: s_endpgm
+define void @test(i32 addrspace(1)* %out) {
+  store i32 0, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/disconnected-predset-break-bug.ll b/test/CodeGen/R600/disconnected-predset-break-bug.ll
index 858e4b98f3ab..cdd2c0cd4f43 100644
--- a/test/CodeGen/R600/disconnected-predset-break-bug.ll
+++ b/test/CodeGen/R600/disconnected-predset-break-bug.ll
@@ -18,7 +18,7 @@ for.body:                                         ; preds = %for.body, %entry
   %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ]
   %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ]
   %i.07 = add nsw i32 %i.07.in, -1
-  %arrayidx = getelementptr inbounds i32 addrspace(1)* %out, i32 %ai.06
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %ai.06
   store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4
   %add = add nsw i32 %ai.06, 1
   %exitcond = icmp eq i32 %add, %iterations
diff --git a/test/CodeGen/R600/dot4-folding.ll b/test/CodeGen/R600/dot4-folding.ll
index dca6a59c6e6a..4df7b63bf98e 100644
--- a/test/CodeGen/R600/dot4-folding.ll
+++ b/test/CodeGen/R600/dot4-folding.ll
@@ -14,8 +14,8 @@
 
 define void @main(float addrspace(1)* %out) {
 main_body:
-  %0 = load <4 x float> addrspace(8)* null
-  %1 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %0 = load <4 x float>, <4 x float> addrspace(8)* null
+  %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %2 = call float @llvm.AMDGPU.dp4(<4 x float> %0,<4 x float> %1)
   %3 = insertelement <4 x float> undef, float %2, i32 0
   call void @llvm.R600.store.swizzle(<4 x float> %3, i32 0, i32 0)
diff --git a/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll b/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll
index 41afd503ef88..e7e13d6178c4 100644
--- a/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll
+++ b/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll
@@ -18,7 +18,7 @@ declare void @llvm.AMDGPU.barrier.local() #1
 ; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], 0x100, [[VADDR]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]]
 
-; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:0 offset1:1
+; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:1
 ; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:33
 ; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256
 ; CHECK: s_endpgm
@@ -33,20 +33,20 @@ for.body:                                         ; preds = %for.body, %entry
   %offset.02 = phi i32 [ %mul, %entry ], [ %add14, %for.body ]
   %k.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
   tail call void @llvm.AMDGPU.barrier.local() #1
-  %arrayidx = getelementptr inbounds float addrspace(3)* %lptr, i32 %offset.02
-  %tmp = load float addrspace(3)* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %offset.02
+  %tmp = load float, float addrspace(3)* %arrayidx, align 4
   %add1 = add nsw i32 %offset.02, 1
-  %arrayidx2 = getelementptr inbounds float addrspace(3)* %lptr, i32 %add1
-  %tmp1 = load float addrspace(3)* %arrayidx2, align 4
+  %arrayidx2 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add1
+  %tmp1 = load float, float addrspace(3)* %arrayidx2, align 4
   %add3 = add nsw i32 %offset.02, 32
-  %arrayidx4 = getelementptr inbounds float addrspace(3)* %lptr, i32 %add3
-  %tmp2 = load float addrspace(3)* %arrayidx4, align 4
+  %arrayidx4 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add3
+  %tmp2 = load float, float addrspace(3)* %arrayidx4, align 4
   %add5 = add nsw i32 %offset.02, 33
-  %arrayidx6 = getelementptr inbounds float addrspace(3)* %lptr, i32 %add5
-  %tmp3 = load float addrspace(3)* %arrayidx6, align 4
+  %arrayidx6 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add5
+  %tmp3 = load float, float addrspace(3)* %arrayidx6, align 4
   %add7 = add nsw i32 %offset.02, 64
-  %arrayidx8 = getelementptr inbounds float addrspace(3)* %lptr, i32 %add7
-  %tmp4 = load float addrspace(3)* %arrayidx8, align 4
+  %arrayidx8 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add7
+  %tmp4 = load float, float addrspace(3)* %arrayidx8, align 4
   %add9 = fadd float %tmp, %tmp1
   %add10 = fadd float %add9, %tmp2
   %add11 = fadd float %add10, %tmp3
@@ -59,7 +59,7 @@ for.body:                                         ; preds = %for.body, %entry
 
 for.end:                                          ; preds = %for.body
   %tmp5 = sext i32 %x.i to i64
-  %arrayidx15 = getelementptr inbounds float addrspace(1)* %out, i64 %tmp5
+  %arrayidx15 = getelementptr inbounds float, float addrspace(1)* %out, i64 %tmp5
   store float %add13, float addrspace(1)* %arrayidx15, align 4
   ret void
 }
diff --git a/test/CodeGen/R600/ds_read2.ll b/test/CodeGen/R600/ds_read2.ll
index c06b0b1392e2..5929898f8bd8 100644
--- a/test/CodeGen/R600/ds_read2.ll
+++ b/test/CodeGen/R600/ds_read2.ll
@@ -7,39 +7,39 @@
  @lds.f64 = addrspace(3) global [512 x double] undef, align 8
 
 ; SI-LABEL: @simple_read2_f32
-; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:0 offset1:8
+; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8
 ; SI: s_waitcnt lgkmcnt(0)
 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
 define void @simple_read2_f32(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
   store float %sum, float addrspace(1)* %out.gep, align 4
   ret void
 }
 
 ; SI-LABEL: @simple_read2_f32_max_offset
-; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:0 offset1:255
+; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255
 ; SI: s_waitcnt lgkmcnt(0)
 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
 define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 255
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
   store float %sum, float addrspace(1)* %out.gep, align 4
   ret void
 }
@@ -51,77 +51,77 @@ define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
 ; SI: s_endpgm
 define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 257
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
   store float %sum, float addrspace(1)* %out.gep, align 4
   ret void
 }
 
 ; SI-LABEL: @simple_read2_f32_x2
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:0 offset1:8
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
 ; SI: s_endpgm
 define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
   %idx.0 = add nsw i32 %tid.x, 0
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
-  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
 
   %idx.1 = add nsw i32 %tid.x, 8
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
-  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
+  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
   %sum.0 = fadd float %val0, %val1
 
   %idx.2 = add nsw i32 %tid.x, 11
-  %arrayidx2 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
-  %val2 = load float addrspace(3)* %arrayidx2, align 4
+  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
+  %val2 = load float, float addrspace(3)* %arrayidx2, align 4
 
   %idx.3 = add nsw i32 %tid.x, 27
-  %arrayidx3 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
-  %val3 = load float addrspace(3)* %arrayidx3, align 4
+  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
+  %val3 = load float, float addrspace(3)* %arrayidx3, align 4
   %sum.1 = fadd float %val2, %val3
 
   %sum = fadd float %sum.0, %sum.1
-  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %idx.0
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0
   store float %sum, float addrspace(1)* %out.gep, align 4
   ret void
 }
 
 ; Make sure there is an instruction between the two sets of reads.
 ; SI-LABEL: @simple_read2_f32_x2_barrier
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:0 offset1:8
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
 ; SI: s_barrier
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
 ; SI: s_endpgm
 define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
   %idx.0 = add nsw i32 %tid.x, 0
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
-  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
 
   %idx.1 = add nsw i32 %tid.x, 8
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
-  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
+  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
   %sum.0 = fadd float %val0, %val1
 
   call void @llvm.AMDGPU.barrier.local() #2
 
   %idx.2 = add nsw i32 %tid.x, 11
-  %arrayidx2 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
-  %val2 = load float addrspace(3)* %arrayidx2, align 4
+  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
+  %val2 = load float, float addrspace(3)* %arrayidx2, align 4
 
   %idx.3 = add nsw i32 %tid.x, 27
-  %arrayidx3 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
-  %val3 = load float addrspace(3)* %arrayidx3, align 4
+  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
+  %val3 = load float, float addrspace(3)* %arrayidx3, align 4
   %sum.1 = fadd float %val2, %val3
 
   %sum = fadd float %sum.0, %sum.1
-  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %idx.0
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0
   store float %sum, float addrspace(1)* %out.gep, align 4
   ret void
 }
@@ -136,25 +136,25 @@ define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
 define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
-  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
 
   %idx.1 = add nsw i32 %tid.x, 8
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
-  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
+  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
   %sum.0 = fadd float %val0, %val1
 
   %idx.2 = add nsw i32 %tid.x, 11
-  %arrayidx2 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
-  %val2 = load float addrspace(3)* %arrayidx2, align 4
+  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
+  %val2 = load float, float addrspace(3)* %arrayidx2, align 4
 
   %idx.3 = add nsw i32 %tid.x, 27
-  %arrayidx3 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
-  %val3 = load float addrspace(3)* %arrayidx3, align 4
+  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
+  %val3 = load float, float addrspace(3)* %arrayidx3, align 4
   %sum.1 = fadd float %val2, %val3
 
   %sum = fadd float %sum.0, %sum.1
-  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %idx.0
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0
   store float %sum, float addrspace(1)* %out.gep, align 4
   ret void
 }
@@ -174,14 +174,14 @@ define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float ad
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
   %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
   %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
-  %gep = getelementptr inbounds <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
+  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
   %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
   %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
-  %val0 = load float addrspace(3)* %gep.0, align 4
-  %val1 = load float addrspace(3)* %gep.1, align 4
+  %val0 = load float, float addrspace(3)* %gep.0, align 4
+  %val1 = load float, float addrspace(3)* %gep.1, align 4
   %add.x = add nsw i32 %x.i, 8
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
   store float %sum, float addrspace(1)* %out.gep, align 4
   ret void
 }
@@ -200,18 +200,18 @@ define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x f
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
   %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
   %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
-  %gep = getelementptr inbounds <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
+  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
   %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
   %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
 
   ; Apply an additional offset after the vector that will be more obviously folded.
-  %gep.1.offset = getelementptr float addrspace(3)* %gep.1, i32 8
+  %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8
 
-  %val0 = load float addrspace(3)* %gep.0, align 4
-  %val1 = load float addrspace(3)* %gep.1.offset, align 4
+  %val0 = load float, float addrspace(3)* %gep.0, align 4
+  %val1 = load float, float addrspace(3)* %gep.1.offset, align 4
   %add.x = add nsw i32 %x.i, 8
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
   store float %sum, float addrspace(1)* %out.gep, align 4
   ret void
 }
@@ -228,14 +228,14 @@ define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
   %x.i.v.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
   %x.i.v.1 = insertelement <2 x i32> %x.i.v.0, i32 %x.i, i32 1
   %idx = add <2 x i32> %x.i.v.1, <i32 0, i32 8>
-  %gep = getelementptr inbounds <2 x [512 x float] addrspace(3)*> %ptr.1, <2 x i32> <i32 0, i32 0>, <2 x i32> %idx
+  %gep = getelementptr inbounds [512 x float], <2 x [512 x float] addrspace(3)*> %ptr.1, <2 x i32> <i32 0, i32 0>, <2 x i32> %idx
   %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
   %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
-  %val0 = load float addrspace(3)* %gep.0, align 4
-  %val1 = load float addrspace(3)* %gep.1, align 4
+  %val0 = load float, float addrspace(3)* %gep.0, align 4
+  %val1 = load float, float addrspace(3)* %gep.1, align 4
   %add.x = add nsw i32 %x.i, 8
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
   store float %sum, float addrspace(1)* %out.gep, align 4
   ret void
 }
@@ -247,13 +247,13 @@ define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
 ; SI: s_endpgm
 define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  %val0 = load volatile float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
   store float %sum, float addrspace(1)* %out.gep, align 4
   ret void
 }
@@ -265,13 +265,13 @@ define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
 ; SI: s_endpgm
 define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  %val1 = load volatile float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %val1 = load volatile float, float addrspace(3)* %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
   store float %sum, float addrspace(1)* %out.gep, align 4
   ret void
 }
@@ -284,13 +284,13 @@ define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
 ; SI: s_endpgm
 define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %arrayidx0 = getelementptr inbounds float addrspace(3)* %lds, i32 %x.i
-  %val0 = load float addrspace(3)* %arrayidx0, align 1
+  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 1
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x
-  %val1 = load float addrspace(3)* %arrayidx1, align 1
+  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x
+  %val1 = load float, float addrspace(3)* %arrayidx1, align 1
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
   store float %sum, float addrspace(1)* %out.gep, align 4
   ret void
 }
@@ -300,48 +300,48 @@ define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %
 ; SI: s_endpgm
 define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %arrayidx0 = getelementptr inbounds float addrspace(3)* %lds, i32 %x.i
-  %val0 = load float addrspace(3)* %arrayidx0, align 2
+  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 2
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x
-  %val1 = load float addrspace(3)* %arrayidx1, align 2
+  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x
+  %val1 = load float, float addrspace(3)* %arrayidx1, align 2
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
   store float %sum, float addrspace(1)* %out.gep, align 4
   ret void
 }
 
 ; SI-LABEL: @simple_read2_f64
 ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}}
-; SI: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset0:0 offset1:8
+; SI: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
 define void @simple_read2_f64(double addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
-  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
-  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
   store double %sum, double addrspace(1)* %out.gep, align 8
   ret void
 }
 
 ; SI-LABEL: @simple_read2_f64_max_offset
-; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:0 offset1:255
+; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255
 ; SI: s_endpgm
 define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
-  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
   %add.x = add nsw i32 %x.i, 255
-  %arrayidx1 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
-  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
   store double %sum, double addrspace(1)* %out.gep, align 8
   ret void
 }
@@ -353,31 +353,31 @@ define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
 ; SI: s_endpgm
 define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
-  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
   %add.x = add nsw i32 %x.i, 257
-  %arrayidx1 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
-  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
   store double %sum, double addrspace(1)* %out.gep, align 8
   ret void
 }
 
 ; Alignment only 4
 ; SI-LABEL: @misaligned_read2_f64
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:0 offset1:1
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15
 ; SI: s_endpgm
 define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %x.i
-  %val0 = load double addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
+  %val0 = load double, double addrspace(3)* %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 7
-  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x
-  %val1 = load double addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
+  %val1 = load double, double addrspace(3)* %arrayidx1, align 4
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
   store double %sum, double addrspace(1)* %out.gep, align 4
   ret void
 }
@@ -386,10 +386,10 @@ define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)
 
 ; SI-LABEL: @load_constant_adjacent_offsets
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:0 offset1:1
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
 define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
-  %val0 = load i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
-  %val1 = load i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
+  %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
+  %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
   %sum = add i32 %val0, %val1
   store i32 %sum, i32 addrspace(1)* %out, align 4
   ret void
@@ -397,10 +397,10 @@ define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
 
 ; SI-LABEL: @load_constant_disjoint_offsets
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:0 offset1:2
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2
 define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
-  %val0 = load i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
-  %val1 = load i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
+  %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
+  %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
   %sum = add i32 %val0, %val1
   store i32 %sum, i32 addrspace(1)* %out, align 4
   ret void
@@ -410,11 +410,11 @@ define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
 
 ; SI-LABEL: @load_misaligned64_constant_offsets
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:0 offset1:1
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
 define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
-  %val0 = load i64 addrspace(3)* getelementptr inbounds ([4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
-  %val1 = load i64 addrspace(3)* getelementptr inbounds ([4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
+  %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
+  %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
   %sum = add i64 %val0, %val1
   store i64 %sum, i64 addrspace(1)* %out, align 8
   ret void
@@ -425,12 +425,12 @@ define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
 ; SI-LABEL: @load_misaligned64_constant_large_offsets
 ; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
 ; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000
-; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset0:0 offset1:1
-; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset0:0 offset1:1
+; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1
+; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1
 ; SI: s_endpgm
 define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
-  %val0 = load i64 addrspace(3)* getelementptr inbounds ([4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
-  %val1 = load i64 addrspace(3)* getelementptr inbounds ([4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
+  %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
+  %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
   %sum = add i64 %val0, %val1
   store i64 %sum, i64 addrspace(1)* %out, align 8
   ret void
@@ -442,34 +442,34 @@ define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
 define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 {
   %x.i = tail call i32 @llvm.r600.read.tgid.x() #1
   %y.i = tail call i32 @llvm.r600.read.tidig.y() #1
-  %arrayidx44 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
-  %tmp16 = load float addrspace(3)* %arrayidx44, align 4
+  %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
+  %tmp16 = load float, float addrspace(3)* %arrayidx44, align 4
   %add47 = add nsw i32 %x.i, 1
-  %arrayidx48 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
-  %tmp17 = load float addrspace(3)* %arrayidx48, align 4
+  %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
+  %tmp17 = load float, float addrspace(3)* %arrayidx48, align 4
   %add51 = add nsw i32 %x.i, 16
-  %arrayidx52 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
-  %tmp18 = load float addrspace(3)* %arrayidx52, align 4
+  %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
+  %tmp18 = load float, float addrspace(3)* %arrayidx52, align 4
   %add55 = add nsw i32 %x.i, 17
-  %arrayidx56 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
-  %tmp19 = load float addrspace(3)* %arrayidx56, align 4
-  %arrayidx60 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
-  %tmp20 = load float addrspace(3)* %arrayidx60, align 4
+  %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
+  %tmp19 = load float, float addrspace(3)* %arrayidx56, align 4
+  %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
+  %tmp20 = load float, float addrspace(3)* %arrayidx60, align 4
   %add63 = add nsw i32 %y.i, 1
-  %arrayidx64 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
-  %tmp21 = load float addrspace(3)* %arrayidx64, align 4
+  %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
+  %tmp21 = load float, float addrspace(3)* %arrayidx64, align 4
   %add67 = add nsw i32 %y.i, 32
-  %arrayidx68 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
-  %tmp22 = load float addrspace(3)* %arrayidx68, align 4
+  %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
+  %tmp22 = load float, float addrspace(3)* %arrayidx68, align 4
   %add71 = add nsw i32 %y.i, 33
-  %arrayidx72 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
-  %tmp23 = load float addrspace(3)* %arrayidx72, align 4
+  %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
+  %tmp23 = load float, float addrspace(3)* %arrayidx72, align 4
   %add75 = add nsw i32 %y.i, 64
-  %arrayidx76 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
-  %tmp24 = load float addrspace(3)* %arrayidx76, align 4
+  %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
+  %tmp24 = load float, float addrspace(3)* %arrayidx76, align 4
   %add79 = add nsw i32 %y.i, 65
-  %arrayidx80 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
-  %tmp25 = load float addrspace(3)* %arrayidx80, align 4
+  %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
+  %tmp25 = load float, float addrspace(3)* %arrayidx80, align 4
   %sum.0 = fadd float %tmp16, %tmp17
   %sum.1 = fadd float %sum.0, %tmp18
   %sum.2 = fadd float %sum.1, %tmp19
@@ -484,13 +484,13 @@ define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i
 }
 
 define void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 {
-  %load = load <2 x i32> addrspace(3)* %in, align 4
+  %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4
   store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8
   ret void
 }
 
 define void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 {
-  %load = load i64 addrspace(3)* %in, align 4
+  %load = load i64, i64 addrspace(3)* %in, align 4
   store i64 %load, i64 addrspace(1)* %out, align 8
   ret void
 }
diff --git a/test/CodeGen/R600/ds_read2_offset_order.ll b/test/CodeGen/R600/ds_read2_offset_order.ll
index 44306bc9d38f..9ea9a5a2617b 100644
--- a/test/CodeGen/R600/ds_read2_offset_order.ll
+++ b/test/CodeGen/R600/ds_read2_offset_order.ll
@@ -14,31 +14,31 @@
 
 define void @offset_order(float addrspace(1)* %out) {
 entry:
-  %ptr0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 0
-  %val0 = load float addrspace(3)* %ptr0
+  %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 0
+  %val0 = load float, float addrspace(3)* %ptr0
 
-  %ptr1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 256
-  %val1 = load float addrspace(3)* %ptr1
+  %ptr1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 256
+  %val1 = load float, float addrspace(3)* %ptr1
   %add1 = fadd float %val0, %val1
 
-  %ptr2 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 3
-  %val2 = load float addrspace(3)* %ptr2
+  %ptr2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 3
+  %val2 = load float, float addrspace(3)* %ptr2
   %add2 = fadd float %add1, %val2
 
-  %ptr3 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 2
-  %val3 = load float addrspace(3)* %ptr3
+  %ptr3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2
+  %val3 = load float, float addrspace(3)* %ptr3
   %add3 = fadd float %add2, %val3
 
-  %ptr4 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 12
-  %val4 = load float addrspace(3)* %ptr4
+  %ptr4 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 12
+  %val4 = load float, float addrspace(3)* %ptr4
   %add4 = fadd float %add3, %val4
 
-  %ptr5 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 14
-  %val5 = load float addrspace(3)* %ptr5
+  %ptr5 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 14
+  %val5 = load float, float addrspace(3)* %ptr5
   %add5 = fadd float %add4, %val5
 
-  %ptr6 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 11
-  %val6 = load float addrspace(3)* %ptr6
+  %ptr6 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11
+  %val6 = load float, float addrspace(3)* %ptr6
   %add6 = fadd float %add5, %val6
   store float %add6, float addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/ds_read2st64.ll b/test/CodeGen/R600/ds_read2st64.ll
index efd875e93176..54b3b45636d6 100644
--- a/test/CodeGen/R600/ds_read2st64.ll
+++ b/test/CodeGen/R600/ds_read2st64.ll
@@ -5,20 +5,20 @@
 
 
 ; SI-LABEL: @simple_read2st64_f32_0_1
-; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
 ; SI: s_waitcnt lgkmcnt(0)
 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
 define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 64
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
   store float %sum, float addrspace(1)* %out.gep, align 4
   ret void
 }
@@ -32,13 +32,13 @@ define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
 define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
-  %arrayidx0 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x.0
-  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
   %add.x.1 = add nsw i32 %x.i, 128
-  %arrayidx1 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x.1
-  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1
+  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
   store float %sum, float addrspace(1)* %out.gep, align 4
   ret void
 }
@@ -52,13 +52,13 @@ define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(
 define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
-  %arrayidx0 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x.0
-  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
   %add.x.1 = add nsw i32 %x.i, 16320
-  %arrayidx1 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x.1
-  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1
+  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
   store float %sum, float addrspace(1)* %out.gep, align 4
   ret void
 }
@@ -72,13 +72,13 @@ define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float add
 define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
-  %arrayidx0 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x.0
-  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
   %add.x.1 = add nsw i32 %x.i, 16384
-  %arrayidx1 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x.1
-  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1
+  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
   store float %sum, float addrspace(1)* %out.gep, align 4
   ret void
 }
@@ -88,13 +88,13 @@ define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, floa
 ; SI: s_endpgm
 define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 63
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
   store float %sum, float addrspace(1)* %out.gep, align 4
   ret void
 }
@@ -105,32 +105,32 @@ define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
 define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
-  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
   %add.x.1 = add nsw i32 %x.i, 127
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1
-  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1
+  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
   store float %sum, float addrspace(1)* %out.gep, align 4
   ret void
 }
 
 ; SI-LABEL: @simple_read2st64_f64_0_1
-; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
 ; SI: s_waitcnt lgkmcnt(0)
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
 define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
-  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
   %add.x = add nsw i32 %x.i, 64
-  %arrayidx1 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
-  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
   store double %sum, double addrspace(1)* %out.gep, align 8
   ret void
 }
@@ -144,13 +144,13 @@ define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
 define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
-  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.0
-  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
+  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
   %add.x.1 = add nsw i32 %x.i, 128
-  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.1
-  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
+  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
   store double %sum, double addrspace(1)* %out.gep, align 8
   ret void
 }
@@ -158,18 +158,18 @@ define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspac
 ; Alignment only
 
 ; SI-LABEL: @misaligned_read2st64_f64
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:0 offset1:1
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
 ; SI: s_endpgm
 define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %x.i
-  %val0 = load double addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
+  %val0 = load double, double addrspace(3)* %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 64
-  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x
-  %val1 = load double addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
+  %val1 = load double, double addrspace(3)* %arrayidx1, align 4
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
   store double %sum, double addrspace(1)* %out.gep, align 4
   ret void
 }
@@ -184,13 +184,13 @@ define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspac
 define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
   %add.x.0 = add nsw i32 %x.i, 256
-  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.0
-  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
+  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
   %add.x.1 = add nsw i32 %x.i, 8128
-  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.1
-  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
+  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
   store double %sum, double addrspace(1)* %out.gep, align 8
   ret void
 }
@@ -204,13 +204,13 @@ define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double a
 define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
-  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.0
-  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
+  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
   %add.x.1 = add nsw i32 %x.i, 8192
-  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.1
-  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
+  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
   store double %sum, double addrspace(1)* %out.gep, align 8
   ret void
 }
@@ -221,13 +221,13 @@ define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, dou
 define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
-  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.0
-  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
+  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
   %add.x.1 = add nsw i32 %x.i, 8129
-  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.1
-  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
+  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
   store double %sum, double addrspace(1)* %out.gep, align 8
   ret void
 }
@@ -237,17 +237,17 @@ define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double
 
 ; SI-LABEL: @byte_size_only_divisible_64_read2_f64
 ; SI-NOT: ds_read2st_b64
-; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:0 offset1:8
+; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8
 ; SI: s_endpgm
 define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %x.i
-  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
+  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x
-  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
+  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
   store double %sum, double addrspace(1)* %out.gep, align 4
   ret void
 }
diff --git a/test/CodeGen/R600/ds_write2.ll b/test/CodeGen/R600/ds_write2.ll
index 27273e7c674d..b553d3459e40 100644
--- a/test/CodeGen/R600/ds_write2.ll
+++ b/test/CodeGen/R600/ds_write2.ll
@@ -7,16 +7,16 @@
 ; SI-LABEL: @simple_write2_one_val_f32
 ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset0:0 offset1:8 [M0]
+; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8
 ; SI: s_endpgm
 define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %in.gep = getelementptr float addrspace(1)* %in, i32 %x.i
-  %val = load float addrspace(1)* %in.gep, align 4
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
+  %val = load float, float addrspace(1)* %in.gep, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   store float %val, float addrspace(3)* %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
   store float %val, float addrspace(3)* %arrayidx1, align 4
   ret void
 }
@@ -25,18 +25,18 @@ define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1
 ; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:8 [M0]
+; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 
 ; SI: s_endpgm
 define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %in.gep.0 = getelementptr float addrspace(1)* %in, i32 %x.i
-  %in.gep.1 = getelementptr float addrspace(1)* %in.gep.0, i32 1
-  %val0 = load float addrspace(1)* %in.gep.0, align 4
-  %val1 = load float addrspace(1)* %in.gep.1, align 4
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
+  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
+  %val0 = load float, float addrspace(1)* %in.gep.0, align 4
+  %val1 = load float, float addrspace(1)* %in.gep.1, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   store float %val0, float addrspace(3)* %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
   store float %val1, float addrspace(3)* %arrayidx1, align 4
   ret void
 }
@@ -48,14 +48,14 @@ define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1
 ; SI: s_endpgm
 define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %in0.gep = getelementptr float addrspace(1)* %in0, i32 %x.i
-  %in1.gep = getelementptr float addrspace(1)* %in1, i32 %x.i
-  %val0 = load float addrspace(1)* %in0.gep, align 4
-  %val1 = load float addrspace(1)* %in1.gep, align 4
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
+  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
+  %val0 = load float, float addrspace(1)* %in0.gep, align 4
+  %val1 = load float, float addrspace(1)* %in1.gep, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   store volatile float %val0, float addrspace(3)* %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
   store float %val1, float addrspace(3)* %arrayidx1, align 4
   ret void
 }
@@ -67,14 +67,14 @@ define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float
 ; SI: s_endpgm
 define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %in0.gep = getelementptr float addrspace(1)* %in0, i32 %x.i
-  %in1.gep = getelementptr float addrspace(1)* %in1, i32 %x.i
-  %val0 = load float addrspace(1)* %in0.gep, align 4
-  %val1 = load float addrspace(1)* %in1.gep, align 4
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
+  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
+  %val0 = load float, float addrspace(1)* %in0.gep, align 4
+  %val1 = load float, float addrspace(1)* %in1.gep, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   store float %val0, float addrspace(3)* %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
   store volatile float %val1, float addrspace(3)* %arrayidx1, align 4
   ret void
 }
@@ -84,20 +84,20 @@ define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float
 ; SI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
 ; SI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
 ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:8 [M0]
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; SI: s_endpgm
 define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %in.gep.0 = getelementptr <2 x float> addrspace(1)* %in, i32 %x.i
-  %in.gep.1 = getelementptr <2 x float> addrspace(1)* %in.gep.0, i32 1
-  %val0 = load <2 x float> addrspace(1)* %in.gep.0, align 8
-  %val1 = load <2 x float> addrspace(1)* %in.gep.1, align 8
+  %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
+  %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
+  %val0 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8
+  %val1 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8
   %val0.0 = extractelement <2 x float> %val0, i32 0
   %val1.1 = extractelement <2 x float> %val1, i32 1
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   store float %val0.0, float addrspace(3)* %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
   store float %val1.1, float addrspace(3)* %arrayidx1, align 4
   ret void
 }
@@ -105,18 +105,18 @@ define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2
 ; SI-LABEL: @simple_write2_two_val_subreg2_f32
 ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:8 [M0]
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; SI: s_endpgm
 define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %in.gep = getelementptr <2 x float> addrspace(1)* %in, i32 %x.i
-  %val = load <2 x float> addrspace(1)* %in.gep, align 8
+  %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
+  %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
   %val0 = extractelement <2 x float> %val, i32 0
   %val1 = extractelement <2 x float> %val, i32 1
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   store float %val0, float addrspace(3)* %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
   store float %val1, float addrspace(3)* %arrayidx1, align 4
   ret void
 }
@@ -124,18 +124,18 @@ define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x floa
 ; SI-LABEL: @simple_write2_two_val_subreg4_f32
 ; SI-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:8 [M0]
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; SI: s_endpgm
 define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %in.gep = getelementptr <4 x float> addrspace(1)* %in, i32 %x.i
-  %val = load <4 x float> addrspace(1)* %in.gep, align 16
+  %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
+  %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
   %val0 = extractelement <4 x float> %val, i32 0
   %val1 = extractelement <4 x float> %val, i32 3
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   store float %val0, float addrspace(3)* %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
   store float %val1, float addrspace(3)* %arrayidx1, align 4
   ret void
 }
@@ -144,18 +144,18 @@ define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x floa
 ; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:255 [M0]
+; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
 ; SI: s_endpgm
 define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %in.gep.0 = getelementptr float addrspace(1)* %in, i32 %x.i
-  %in.gep.1 = getelementptr float addrspace(1)* %in.gep.0, i32 1
-  %val0 = load float addrspace(1)* %in.gep.0, align 4
-  %val1 = load float addrspace(1)* %in.gep.1, align 4
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
+  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
+  %val0 = load float, float addrspace(1)* %in.gep.0, align 4
+  %val1 = load float, float addrspace(1)* %in.gep.1, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   store float %val0, float addrspace(3)* %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 255
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
   store float %val1, float addrspace(3)* %arrayidx1, align 4
   ret void
 }
@@ -166,43 +166,43 @@ define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float
 ; SI: s_endpgm
 define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %in0.gep = getelementptr float addrspace(1)* %in0, i32 %x.i
-  %in1.gep = getelementptr float addrspace(1)* %in1, i32 %x.i
-  %val0 = load float addrspace(1)* %in0.gep, align 4
-  %val1 = load float addrspace(1)* %in1.gep, align 4
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
+  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
+  %val0 = load float, float addrspace(1)* %in0.gep, align 4
+  %val1 = load float, float addrspace(1)* %in1.gep, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   store float %val0, float addrspace(3)* %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 257
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
   store float %val1, float addrspace(3)* %arrayidx1, align 4
   ret void
 }
 
 ; SI-LABEL: @simple_write2_two_val_f32_x2
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:0 offset1:8
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
 ; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
 ; SI: s_endpgm
 define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
-  %in0.gep = getelementptr float addrspace(1)* %in0, i32 %tid.x
-  %in1.gep = getelementptr float addrspace(1)* %in1, i32 %tid.x
-  %val0 = load float addrspace(1)* %in0.gep, align 4
-  %val1 = load float addrspace(1)* %in1.gep, align 4
+  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
+  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
+  %val0 = load float, float addrspace(1)* %in0.gep, align 4
+  %val1 = load float, float addrspace(1)* %in1.gep, align 4
 
   %idx.0 = add nsw i32 %tid.x, 0
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
   store float %val0, float addrspace(3)* %arrayidx0, align 4
 
   %idx.1 = add nsw i32 %tid.x, 8
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
   store float %val1, float addrspace(3)* %arrayidx1, align 4
 
   %idx.2 = add nsw i32 %tid.x, 11
-  %arrayidx2 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
+  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
   store float %val0, float addrspace(3)* %arrayidx2, align 4
 
   %idx.3 = add nsw i32 %tid.x, 27
-  %arrayidx3 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
+  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
   store float %val1, float addrspace(3)* %arrayidx3, align 4
 
   ret void
@@ -214,25 +214,25 @@ define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspac
 ; SI: s_endpgm
 define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
-  %in0.gep = getelementptr float addrspace(1)* %in0, i32 %tid.x
-  %in1.gep = getelementptr float addrspace(1)* %in1, i32 %tid.x
-  %val0 = load float addrspace(1)* %in0.gep, align 4
-  %val1 = load float addrspace(1)* %in1.gep, align 4
+  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
+  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
+  %val0 = load float, float addrspace(1)* %in0.gep, align 4
+  %val1 = load float, float addrspace(1)* %in1.gep, align 4
 
   %idx.0 = add nsw i32 %tid.x, 3
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
   store float %val0, float addrspace(3)* %arrayidx0, align 4
 
   %idx.1 = add nsw i32 %tid.x, 8
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
   store float %val1, float addrspace(3)* %arrayidx1, align 4
 
   %idx.2 = add nsw i32 %tid.x, 11
-  %arrayidx2 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
+  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
   store float %val0, float addrspace(3)* %arrayidx2, align 4
 
   %idx.3 = add nsw i32 %tid.x, 27
-  %arrayidx3 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
+  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
   store float %val1, float addrspace(3)* %arrayidx3, align 4
 
   ret void
@@ -245,19 +245,19 @@ define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, f
 ; SI: s_endpgm
 define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %in0.gep = getelementptr float addrspace(1)* %in0, i32 %x.i
-  %in1.gep = getelementptr float addrspace(1)* %in1, i32 %x.i
-  %val0 = load float addrspace(1)* %in0.gep, align 4
-  %val1 = load float addrspace(1)* %in1.gep, align 4
+  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
+  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
+  %val0 = load float, float addrspace(1)* %in0.gep, align 4
+  %val1 = load float, float addrspace(1)* %in1.gep, align 4
 
   %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
   %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
-  %gep = getelementptr inbounds <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
+  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
   %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
   %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
 
   ; Apply an additional offset after the vector that will be more obviously folded.
-  %gep.1.offset = getelementptr float addrspace(3)* %gep.1, i32 8
+  %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8
   store float %val0, float addrspace(3)* %gep.0, align 4
 
   %add.x = add nsw i32 %x.i, 8
@@ -268,16 +268,16 @@ define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float add
 ; SI-LABEL: @simple_write2_one_val_f64
 ; SI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
 ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
-; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset0:0 offset1:8 [M0]
+; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
 ; SI: s_endpgm
 define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %in.gep = getelementptr double addrspace(1)* %in, i32 %x.i
-  %val = load double addrspace(1)* %in.gep, align 8
-  %arrayidx0 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
+  %val = load double, double addrspace(1)* %in.gep, align 8
+  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
   store double %val, double addrspace(3)* %arrayidx0, align 8
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
   store double %val, double addrspace(3)* %arrayidx1, align 8
   ret void
 }
@@ -285,17 +285,17 @@ define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace
 ; SI-LABEL: @misaligned_simple_write2_one_val_f64
 ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:1 [M0]
-; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15 [M0]
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15
 ; SI: s_endpgm
 define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %in.gep = getelementptr double addrspace(1)* %in, i32 %x.i
-  %val = load double addrspace(1)* %in.gep, align 8
-  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %x.i
+  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
+  %val = load double, double addrspace(1)* %in.gep, align 8
+  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
   store double %val, double addrspace(3)* %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 7
-  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x
+  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
   store double %val, double addrspace(3)* %arrayidx1, align 4
   ret void
 }
@@ -304,18 +304,18 @@ define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, doubl
 ; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
-; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:8 [M0]
+; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
 ; SI: s_endpgm
 define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %in.gep.0 = getelementptr double addrspace(1)* %in, i32 %x.i
-  %in.gep.1 = getelementptr double addrspace(1)* %in.gep.0, i32 1
-  %val0 = load double addrspace(1)* %in.gep.0, align 8
-  %val1 = load double addrspace(1)* %in.gep.1, align 8
-  %arrayidx0 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+  %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
+  %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
+  %val0 = load double, double addrspace(1)* %in.gep.0, align 8
+  %val1 = load double, double addrspace(1)* %in.gep.1, align 8
+  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
   store double %val0, double addrspace(3)* %arrayidx0, align 8
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
   store double %val1, double addrspace(3)* %arrayidx1, align 8
   ret void
 }
@@ -324,20 +324,20 @@ define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace
 
 ; SI-LABEL: @store_constant_adjacent_offsets
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 define void @store_constant_adjacent_offsets() {
-  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
-  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
+  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
+  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
   ret void
 }
 
 ; SI-LABEL: @store_constant_disjoint_offsets
 ; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
 ; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset0:0 offset1:2
+; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2
 define void @store_constant_disjoint_offsets() {
-  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
-  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
+  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
+  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
   ret void
 }
 
@@ -345,11 +345,11 @@ define void @store_constant_disjoint_offsets() {
 
 ; SI-LABEL: @store_misaligned64_constant_offsets
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
 define void @store_misaligned64_constant_offsets() {
-  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
-  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
   ret void
 }
 
@@ -358,12 +358,12 @@ define void @store_misaligned64_constant_offsets() {
 ; SI-LABEL: @store_misaligned64_constant_large_offsets
 ; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
 ; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}
-; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
-; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; SI: s_endpgm
 define void @store_misaligned64_constant_large_offsets() {
-  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
-  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
   ret void
 }
 
@@ -373,34 +373,34 @@ define void @store_misaligned64_constant_large_offsets() {
 define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tgid.x() #1
   %y.i = tail call i32 @llvm.r600.read.tidig.y() #1
-  %val = load float addrspace(1)* %in
-  %arrayidx44 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
+  %val = load float, float addrspace(1)* %in
+  %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
   store float %val, float addrspace(3)* %arrayidx44, align 4
   %add47 = add nsw i32 %x.i, 1
-  %arrayidx48 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
+  %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
   store float %val, float addrspace(3)* %arrayidx48, align 4
   %add51 = add nsw i32 %x.i, 16
-  %arrayidx52 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
+  %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
   store float %val, float addrspace(3)* %arrayidx52, align 4
   %add55 = add nsw i32 %x.i, 17
-  %arrayidx56 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
+  %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
   store float %val, float addrspace(3)* %arrayidx56, align 4
-  %arrayidx60 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
+  %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
   store float %val, float addrspace(3)* %arrayidx60, align 4
   %add63 = add nsw i32 %y.i, 1
-  %arrayidx64 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
+  %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
   store float %val, float addrspace(3)* %arrayidx64, align 4
   %add67 = add nsw i32 %y.i, 32
-  %arrayidx68 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
+  %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
   store float %val, float addrspace(3)* %arrayidx68, align 4
   %add71 = add nsw i32 %y.i, 33
-  %arrayidx72 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
+  %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
   store float %val, float addrspace(3)* %arrayidx72, align 4
   %add75 = add nsw i32 %y.i, 64
-  %arrayidx76 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
+  %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
   store float %val, float addrspace(3)* %arrayidx76, align 4
   %add79 = add nsw i32 %y.i, 65
-  %arrayidx80 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
+  %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
   store float %val, float addrspace(3)* %arrayidx80, align 4
   ret void
 }
diff --git a/test/CodeGen/R600/ds_write2st64.ll b/test/CodeGen/R600/ds_write2st64.ll
index de5f4efcbcd3..1d9d881c5c7e 100644
--- a/test/CodeGen/R600/ds_write2st64.ll
+++ b/test/CodeGen/R600/ds_write2st64.ll
@@ -7,16 +7,16 @@
 ; SI-LABEL: @simple_write2st64_one_val_f32_0_1
 ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset0:0 offset1:1 [M0]
+; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1
 ; SI: s_endpgm
 define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %in.gep = getelementptr float addrspace(1)* %in, i32 %x.i
-  %val = load float addrspace(1)* %in.gep, align 4
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
+  %val = load float, float addrspace(1)* %in.gep, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   store float %val, float addrspace(3)* %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 64
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
   store float %val, float addrspace(3)* %arrayidx1, align 4
   ret void
 }
@@ -25,19 +25,19 @@ define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float add
 ; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5 [M0]
+; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5
 ; SI: s_endpgm
 define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %in.gep.0 = getelementptr float addrspace(1)* %in, i32 %x.i
-  %in.gep.1 = getelementptr float addrspace(1)* %in.gep.0, i32 1
-  %val0 = load float addrspace(1)* %in.gep.0, align 4
-  %val1 = load float addrspace(1)* %in.gep.1, align 4
+  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
+  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
+  %val0 = load float, float addrspace(1)* %in.gep.0, align 4
+  %val1 = load float, float addrspace(1)* %in.gep.1, align 4
   %add.x.0 = add nsw i32 %x.i, 128
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
   store float %val0, float addrspace(3)* %arrayidx0, align 4
   %add.x.1 = add nsw i32 %x.i, 320
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1
   store float %val1, float addrspace(3)* %arrayidx1, align 4
   ret void
 }
@@ -46,18 +46,18 @@ define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float add
 ; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:255 [M0]
+; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
 ; SI: s_endpgm
 define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %in.gep.0 = getelementptr float addrspace(1)* %in, i32 %x.i
-  %in.gep.1 = getelementptr float addrspace(1)* %in.gep.0, i32 1
-  %val0 = load float addrspace(1)* %in.gep.0, align 4
-  %val1 = load float addrspace(1)* %in.gep.1, align 4
-  %arrayidx0 = getelementptr inbounds float addrspace(3)* %lds, i32 %x.i
+  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
+  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
+  %val0 = load float, float addrspace(1)* %in.gep.0, align 4
+  %val1 = load float, float addrspace(1)* %in.gep.1, align 4
+  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
   store float %val0, float addrspace(3)* %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 16320
-  %arrayidx1 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x
+  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x
   store float %val1, float addrspace(3)* %arrayidx1, align 4
   ret void
 }
@@ -66,35 +66,35 @@ define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, fl
 ; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
 ; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]],
-; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127 [M0]
+; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127
 ; SI: s_endpgm
 define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %in.gep.0 = getelementptr double addrspace(1)* %in, i32 %x.i
-  %in.gep.1 = getelementptr double addrspace(1)* %in.gep.0, i32 1
-  %val0 = load double addrspace(1)* %in.gep.0, align 8
-  %val1 = load double addrspace(1)* %in.gep.1, align 8
+  %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
+  %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
+  %val0 = load double, double addrspace(1)* %in.gep.0, align 8
+  %val1 = load double, double addrspace(1)* %in.gep.1, align 8
   %add.x.0 = add nsw i32 %x.i, 256
-  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.0
+  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
   store double %val0, double addrspace(3)* %arrayidx0, align 8
   %add.x.1 = add nsw i32 %x.i, 8128
-  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.1
+  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
   store double %val1, double addrspace(3)* %arrayidx1, align 8
   ret void
 }
 
 ; SI-LABEL: @byte_size_only_divisible_64_write2st64_f64
 ; SI-NOT: ds_write2st64_b64
-; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:0 offset1:8
+; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8
 ; SI: s_endpgm
 define void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %in.gep = getelementptr double addrspace(1)* %in, i32 %x.i
-  %val = load double addrspace(1)* %in.gep, align 8
-  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %x.i
+  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
+  %val = load double, double addrspace(1)* %in.gep, align 8
+  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
   store double %val, double addrspace(3)* %arrayidx0, align 8
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x
+  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
   store double %val, double addrspace(3)* %arrayidx1, align 8
   ret void
 }
diff --git a/test/CodeGen/R600/elf.ll b/test/CodeGen/R600/elf.ll
index f801b3f57357..d0fd06a34379 100644
--- a/test/CodeGen/R600/elf.ll
+++ b/test/CodeGen/R600/elf.ll
@@ -5,6 +5,9 @@
 ; RUN: llc < %s -march=amdgcn -mcpu=carrizo -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s
 ; RUN: llc < %s -march=amdgcn -mcpu=carrizo -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s
 
+; Test that we don't try to produce a COFF file on windows
+; RUN: llc < %s -mtriple=amdgcn-pc-mingw -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s
+
 ; ELF: Format: ELF32
 ; ELF: Name: .AMDGPU.config
 ; ELF: Type: SHT_PROGBITS
@@ -13,12 +16,12 @@
 ; ELF: Name: test
 ; ELF: Binding: Global
 
-; CONFIG: .align 256
-; CONFIG: test:
 ; CONFIG: .section .AMDGPU.config
 ; CONFIG-NEXT: .long   45096
 ; TYPICAL-NEXT: .long   0
 ; TONGA-NEXT: .long   576
+; CONFIG: .align 256
+; CONFIG: test:
 define void @test(i32 %p) #0 {
    %i = add i32 %p, 2
    %r = bitcast i32 %i to float
diff --git a/test/CodeGen/R600/empty-function.ll b/test/CodeGen/R600/empty-function.ll
index b5593eb87ae4..a060900811ea 100644
--- a/test/CodeGen/R600/empty-function.ll
+++ b/test/CodeGen/R600/empty-function.ll
@@ -3,16 +3,16 @@
 
 ; Make sure we don't assert on empty functions
 
-; SI-LABEL: {{^}}empty_function_ret:
 ; SI: .text
+; SI-LABEL: {{^}}empty_function_ret:
 ; SI: s_endpgm
 ; SI: codeLenInByte = 4
 define void @empty_function_ret() #0 {
   ret void
 }
 
-; SI-LABEL: {{^}}empty_function_unreachable:
 ; SI: .text
+; SI-LABEL: {{^}}empty_function_unreachable:
 ; SI: codeLenInByte = 0
 define void @empty_function_unreachable() #0 {
   unreachable
diff --git a/test/CodeGen/R600/endcf-loop-header.ll b/test/CodeGen/R600/endcf-loop-header.ll
index e3c5b3c1c364..267a323c5063 100644
--- a/test/CodeGen/R600/endcf-loop-header.ll
+++ b/test/CodeGen/R600/endcf-loop-header.ll
@@ -28,7 +28,7 @@ loop:
   br i1 %tmp2, label %done, label %loop
 
 done:
-  %tmp3 = getelementptr i32 addrspace(1)* %out, i64 1
+  %tmp3 = getelementptr i32, i32 addrspace(1)* %out, i64 1
   store i32 %inc, i32 addrspace(1)* %tmp3
   ret void
 }
diff --git a/test/CodeGen/R600/extload-private.ll b/test/CodeGen/R600/extload-private.ll
index fec868232507..294c3a9c6782 100644
--- a/test/CodeGen/R600/extload-private.ll
+++ b/test/CodeGen/R600/extload-private.ll
@@ -6,7 +6,7 @@
 define void @load_i8_sext_private(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca i8
-  %tmp1 = load i8* %tmp0
+  %tmp1 = load i8, i8* %tmp0
   %tmp2 = sext i8 %tmp1 to i32
   store i32 %tmp2, i32 addrspace(1)* %out
   ret void
@@ -17,7 +17,7 @@ entry:
 define void @load_i8_zext_private(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca i8
-  %tmp1 = load i8* %tmp0
+  %tmp1 = load i8, i8* %tmp0
   %tmp2 = zext i8 %tmp1 to i32
   store i32 %tmp2, i32 addrspace(1)* %out
   ret void
@@ -28,7 +28,7 @@ entry:
 define void @load_i16_sext_private(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca i16
-  %tmp1 = load i16* %tmp0
+  %tmp1 = load i16, i16* %tmp0
   %tmp2 = sext i16 %tmp1 to i32
   store i32 %tmp2, i32 addrspace(1)* %out
   ret void
@@ -39,7 +39,7 @@ entry:
 define void @load_i16_zext_private(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca i16
-  %tmp1 = load i16* %tmp0
+  %tmp1 = load i16, i16* %tmp0
   %tmp2 = zext i16 %tmp1 to i32
   store i32 %tmp2, i32 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/extload.ll b/test/CodeGen/R600/extload.ll
index 73d6701bfb5b..662eb7a9716b 100644
--- a/test/CodeGen/R600/extload.ll
+++ b/test/CodeGen/R600/extload.ll
@@ -3,11 +3,12 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}anyext_load_i8:
-; EG: AND_INT
-; EG: 255
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]],
+; EG: VTX_READ_32 [[VAL]]
+
 define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
   %cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)*
-  %load = load i32 addrspace(1)* %cast, align 1
+  %load = load i32, i32 addrspace(1)* %cast, align 1
   %x = bitcast i32 %load to <4 x i8>
   %castOut = bitcast i8 addrspace(1)* %out to <4 x i8> addrspace(1)*
   store <4 x i8> %x, <4 x i8> addrspace(1)* %castOut, align 1
@@ -15,13 +16,12 @@ define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspac
 }
 
 ; FUNC-LABEL: {{^}}anyext_load_i16:
-; EG: AND_INT
-; EG: AND_INT
-; EG-DAG: 65535
-; EG-DAG: -65536
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]],
+; EG: VTX_READ_32 [[VAL]]
+
 define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
   %cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)*
-  %load = load i32 addrspace(1)* %cast, align 1
+  %load = load i32, i32 addrspace(1)* %cast, align 1
   %x = bitcast i32 %load to <2 x i16>
   %castOut = bitcast i16 addrspace(1)* %out to <2 x i16> addrspace(1)*
   store <2 x i16> %x, <2 x i16> addrspace(1)* %castOut, align 1
@@ -29,11 +29,11 @@ define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrs
 }
 
 ; FUNC-LABEL: {{^}}anyext_load_lds_i8:
-; EG: AND_INT
-; EG: 255
+; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]]
+; EG: LDS_WRITE * [[VAL]]
 define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
   %cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)*
-  %load = load i32 addrspace(3)* %cast, align 1
+  %load = load i32, i32 addrspace(3)* %cast, align 1
   %x = bitcast i32 %load to <4 x i8>
   %castOut = bitcast i8 addrspace(3)* %out to <4 x i8> addrspace(3)*
   store <4 x i8> %x, <4 x i8> addrspace(3)* %castOut, align 1
@@ -41,13 +41,11 @@ define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addr
 }
 
 ; FUNC-LABEL: {{^}}anyext_load_lds_i16:
-; EG: AND_INT
-; EG: AND_INT
-; EG-DAG: 65535
-; EG-DAG: -65536
+; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]]
+; EG: LDS_WRITE * [[VAL]]
 define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
   %cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)*
-  %load = load i32 addrspace(3)* %cast, align 1
+  %load = load i32, i32 addrspace(3)* %cast, align 1
   %x = bitcast i32 %load to <2 x i16>
   %castOut = bitcast i16 addrspace(3)* %out to <2 x i16> addrspace(3)*
   store <2 x i16> %x, <2 x i16> addrspace(3)* %castOut, align 1
diff --git a/test/CodeGen/R600/extract_vector_elt_i16.ll b/test/CodeGen/R600/extract_vector_elt_i16.ll
index 0774a9ae852b..c7572efc6f5b 100644
--- a/test/CodeGen/R600/extract_vector_elt_i16.ll
+++ b/test/CodeGen/R600/extract_vector_elt_i16.ll
@@ -9,7 +9,7 @@
 define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) nounwind {
   %p0 = extractelement <2 x i16> %foo, i32 0
   %p1 = extractelement <2 x i16> %foo, i32 1
-  %out1 = getelementptr i16 addrspace(1)* %out, i32 1
+  %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
   store i16 %p1, i16 addrspace(1)* %out, align 2
   store i16 %p0, i16 addrspace(1)* %out1, align 2
   ret void
@@ -23,7 +23,7 @@ define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) no
 define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) nounwind {
   %p0 = extractelement <4 x i16> %foo, i32 0
   %p1 = extractelement <4 x i16> %foo, i32 2
-  %out1 = getelementptr i16 addrspace(1)* %out, i32 1
+  %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
   store i16 %p1, i16 addrspace(1)* %out, align 2
   store i16 %p0, i16 addrspace(1)* %out1, align 2
   ret void
diff --git a/test/CodeGen/R600/fabs.f64.ll b/test/CodeGen/R600/fabs.f64.ll
index d87c08260b4c..3c6136c1a7bd 100644
--- a/test/CodeGen/R600/fabs.f64.ll
+++ b/test/CodeGen/R600/fabs.f64.ll
@@ -13,8 +13,8 @@ declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone
 define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
   %tidext = sext i32 %tid to i64
-  %gep = getelementptr double addrspace(1)* %in, i64 %tidext
-  %val = load double addrspace(1)* %gep, align 8
+  %gep = getelementptr double, double addrspace(1)* %in, i64 %tidext
+  %val = load double, double addrspace(1)* %gep, align 8
   %fabs = call double @llvm.fabs.f64(double %val)
   store double %fabs, double addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/fabs.ll b/test/CodeGen/R600/fabs.ll
index add6b75d22ae..419a73d02669 100644
--- a/test/CodeGen/R600/fabs.ll
+++ b/test/CodeGen/R600/fabs.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 
@@ -10,7 +11,7 @@
 ; R600-NOT: AND
 ; R600: |PV.{{[XYZW]}}|
 
-; SI: v_and_b32
+; GCN: v_and_b32
 
 define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) {
   %bc= bitcast i32 %in to float
@@ -23,7 +24,7 @@ define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) {
 ; R600-NOT: AND
 ; R600: |PV.{{[XYZW]}}|
 
-; SI: v_and_b32
+; GCN: v_and_b32
 
 define void @fabs_free(float addrspace(1)* %out, i32 %in) {
   %bc= bitcast i32 %in to float
@@ -35,7 +36,7 @@ define void @fabs_free(float addrspace(1)* %out, i32 %in) {
 ; FUNC-LABEL: {{^}}fabs_f32:
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 
-; SI: v_and_b32
+; GCN: v_and_b32
 define void @fabs_f32(float addrspace(1)* %out, float %in) {
   %fabs = call float @llvm.fabs.f32(float %in)
   store float %fabs, float addrspace(1)* %out
@@ -46,8 +47,8 @@ define void @fabs_f32(float addrspace(1)* %out, float %in) {
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 
-; SI: v_and_b32
-; SI: v_and_b32
+; GCN: v_and_b32
+; GCN: v_and_b32
 define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
   store <2 x float> %fabs, <2 x float> addrspace(1)* %out
@@ -60,20 +61,21 @@ define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 
-; SI: v_and_b32
-; SI: v_and_b32
-; SI: v_and_b32
-; SI: v_and_b32
+; GCN: v_and_b32
+; GCN: v_and_b32
+; GCN: v_and_b32
+; GCN: v_and_b32
 define void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
   %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
   store <4 x float> %fabs, <4 x float> addrspace(1)* %out
   ret void
 }
 
-; SI-LABEL: {{^}}fabs_fn_fold:
+; GCN-LABEL: {{^}}fabs_fn_fold:
 ; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
-; SI-NOT: and
-; SI: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
+; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
+; GCN-NOT: and
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
 define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
   %fabs = call float @fabs(float %in0)
   %fmul = fmul float %fabs, %in1
@@ -81,10 +83,11 @@ define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
   ret void
 }
 
-; SI-LABEL: {{^}}fabs_fold:
+; GCN-LABEL: {{^}}fabs_fold:
 ; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
-; SI-NOT: and
-; SI: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
+; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
+; GCN-NOT: and
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
 define void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) {
   %fabs = call float @llvm.fabs.f32(float %in0)
   %fmul = fmul float %fabs, %in1
diff --git a/test/CodeGen/R600/fadd.ll b/test/CodeGen/R600/fadd.ll
index 365af9b73cc0..5fac328c5981 100644
--- a/test/CodeGen/R600/fadd.ll
+++ b/test/CodeGen/R600/fadd.ll
@@ -32,9 +32,9 @@ define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x flo
 ; SI: v_add_f32
 ; SI: v_add_f32
 define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
-  %a = load <4 x float> addrspace(1)* %in, align 16
-  %b = load <4 x float> addrspace(1)* %b_ptr, align 16
+  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
+  %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16
+  %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16
   %result = fadd <4 x float> %a, %b
   store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
   ret void
diff --git a/test/CodeGen/R600/fadd64.ll b/test/CodeGen/R600/fadd64.ll
index f1f6fef54766..485c55870c47 100644
--- a/test/CodeGen/R600/fadd64.ll
+++ b/test/CodeGen/R600/fadd64.ll
@@ -6,8 +6,8 @@
 
 define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
-   %r0 = load double addrspace(1)* %in1
-   %r1 = load double addrspace(1)* %in2
+   %r0 = load double, double addrspace(1)* %in1
+   %r1 = load double, double addrspace(1)* %in2
    %r2 = fadd double %r0, %r1
    store double %r2, double addrspace(1)* %out
    ret void
diff --git a/test/CodeGen/R600/fceil64.ll b/test/CodeGen/R600/fceil64.ll
index 77cd8eae402c..e8c34f0141e4 100644
--- a/test/CodeGen/R600/fceil64.ll
+++ b/test/CodeGen/R600/fceil64.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 
 declare double @llvm.ceil.f64(double) nounwind readnone
 declare <2 x double> @llvm.ceil.v2f64(<2 x double>) nounwind readnone
@@ -11,19 +12,19 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
 ; FUNC-LABEL: {{^}}fceil_f64:
 ; CI: v_ceil_f64_e32
 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
 ; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
 ; SI: s_lshr_b64
 ; SI: s_not_b64
 ; SI: s_and_b64
-; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI-DAG: cmp_lt_i32
+; SI: cmp_gt_i32
 ; SI: cndmask_b32
 ; SI: cndmask_b32
-; SI: cmp_gt_i32
+; SI: cmp_lt_i32
 ; SI: cndmask_b32
 ; SI: cndmask_b32
-; SI: v_cmp_lg_f64
-; SI: v_cmp_gt_f64
+; SI-DAG: v_cmp_lt_f64
+; SI-DAG: v_cmp_lg_f64
 ; SI: s_and_b64
 ; SI: v_cndmask_b32
 ; SI: v_cndmask_b32
diff --git a/test/CodeGen/R600/fcmp-cnd.ll b/test/CodeGen/R600/fcmp-cnd.ll
index 1d4e323d3abf..530274f920f0 100644
--- a/test/CodeGen/R600/fcmp-cnd.ll
+++ b/test/CodeGen/R600/fcmp-cnd.ll
@@ -6,7 +6,7 @@
 
 define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
-  %0 = load float addrspace(1)* %in
+  %0 = load float, float addrspace(1)* %in
   %cmp = fcmp oeq float %0, 0.000000e+00
   %value = select i1 %cmp, i32 2, i32 3 
   store i32 %value, i32 addrspace(1)* %out
diff --git a/test/CodeGen/R600/fcmp-cnde-int-args.ll b/test/CodeGen/R600/fcmp-cnde-int-args.ll
index 55aba0d72d39..c402805feb39 100644
--- a/test/CodeGen/R600/fcmp-cnde-int-args.ll
+++ b/test/CodeGen/R600/fcmp-cnde-int-args.ll
@@ -8,7 +8,7 @@
 
 define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
-  %0 = load float addrspace(1)* %in
+  %0 = load float, float addrspace(1)* %in
   %cmp = fcmp oeq float %0, 0.000000e+00
   %value = select i1 %cmp, i32 -1, i32 0
   store i32 %value, i32 addrspace(1)* %out
diff --git a/test/CodeGen/R600/fcmp.ll b/test/CodeGen/R600/fcmp.ll
index 33992181e0d9..5207ab57bade 100644
--- a/test/CodeGen/R600/fcmp.ll
+++ b/test/CodeGen/R600/fcmp.ll
@@ -5,9 +5,9 @@
 
 define void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
-  %0 = load float addrspace(1)* %in
-  %arrayidx1 = getelementptr inbounds float addrspace(1)* %in, i32 1
-  %1 = load float addrspace(1)* %arrayidx1
+  %0 = load float, float addrspace(1)* %in
+  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %in, i32 1
+  %1 = load float, float addrspace(1)* %arrayidx1
   %cmp = fcmp oeq float %0, %1
   %sext = sext i1 %cmp to i32
   store i32 %sext, i32 addrspace(1)* %out
@@ -28,7 +28,7 @@ entry:
   br i1 %0, label %IF, label %ENDIF
 
 IF:
-  %1 = getelementptr i32 addrspace(1)* %out, i32 1
+  %1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   store i32 0, i32 addrspace(1)* %1
   br label %ENDIF
 
diff --git a/test/CodeGen/R600/fcmp64.ll b/test/CodeGen/R600/fcmp64.ll
index 9dc8b50513f2..053ab0ed7aaf 100644
--- a/test/CodeGen/R600/fcmp64.ll
+++ b/test/CodeGen/R600/fcmp64.ll
@@ -5,8 +5,8 @@
 ; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
 define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
-   %r0 = load double addrspace(1)* %in1
-   %r1 = load double addrspace(1)* %in2
+   %r0 = load double, double addrspace(1)* %in1
+   %r1 = load double, double addrspace(1)* %in2
    %r2 = fcmp ult double %r0, %r1
    %r3 = zext i1 %r2 to i32
    store i32 %r3, i32 addrspace(1)* %out
@@ -17,8 +17,8 @@ define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
 ; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
 define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
-   %r0 = load double addrspace(1)* %in1
-   %r1 = load double addrspace(1)* %in2
+   %r0 = load double, double addrspace(1)* %in1
+   %r1 = load double, double addrspace(1)* %in2
    %r2 = fcmp ule double %r0, %r1
    %r3 = zext i1 %r2 to i32
    store i32 %r3, i32 addrspace(1)* %out
@@ -29,8 +29,8 @@ define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
 ; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
 define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
-   %r0 = load double addrspace(1)* %in1
-   %r1 = load double addrspace(1)* %in2
+   %r0 = load double, double addrspace(1)* %in1
+   %r1 = load double, double addrspace(1)* %in2
    %r2 = fcmp ugt double %r0, %r1
    %r3 = zext i1 %r2 to i32
    store i32 %r3, i32 addrspace(1)* %out
@@ -41,8 +41,8 @@ define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
 ; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
 define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
-   %r0 = load double addrspace(1)* %in1
-   %r1 = load double addrspace(1)* %in2
+   %r0 = load double, double addrspace(1)* %in1
+   %r1 = load double, double addrspace(1)* %in2
    %r2 = fcmp uge double %r0, %r1
    %r3 = zext i1 %r2 to i32
    store i32 %r3, i32 addrspace(1)* %out
@@ -53,8 +53,8 @@ define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
 ; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
 define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
-   %r0 = load double addrspace(1)* %in1
-   %r1 = load double addrspace(1)* %in2
+   %r0 = load double, double addrspace(1)* %in1
+   %r1 = load double, double addrspace(1)* %in2
    %r2 = fcmp une double %r0, %r1
    %r3 = select i1 %r2, double %r0, double %r1
    store double %r3, double addrspace(1)* %out
@@ -65,8 +65,8 @@ define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 ; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
 define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
-   %r0 = load double addrspace(1)* %in1
-   %r1 = load double addrspace(1)* %in2
+   %r0 = load double, double addrspace(1)* %in1
+   %r1 = load double, double addrspace(1)* %in2
    %r2 = fcmp ueq double %r0, %r1
    %r3 = select i1 %r2, double %r0, double %r1
    store double %r3, double addrspace(1)* %out
diff --git a/test/CodeGen/R600/fconst64.ll b/test/CodeGen/R600/fconst64.ll
index 28e0c909747f..89af37545c99 100644
--- a/test/CodeGen/R600/fconst64.ll
+++ b/test/CodeGen/R600/fconst64.ll
@@ -6,7 +6,7 @@
 ; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0
 
 define void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
-   %r1 = load double addrspace(1)* %in
+   %r1 = load double, double addrspace(1)* %in
    %r2 = fadd double %r1, 5.000000e+00
    store double %r2, double addrspace(1)* %out
    ret void
diff --git a/test/CodeGen/R600/fcopysign.f32.ll b/test/CodeGen/R600/fcopysign.f32.ll
index 4bc5145bd4de..b719d5a39785 100644
--- a/test/CodeGen/R600/fcopysign.f32.ll
+++ b/test/CodeGen/R600/fcopysign.f32.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 
@@ -10,12 +11,14 @@ declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind read
 ; FUNC-LABEL: {{^}}test_copysign_f32:
 ; SI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0xb
 ; SI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0xc
-; SI-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]]
-; SI-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], [[SMAG]]
-; SI-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff
-; SI: v_bfi_b32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; VI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0x2c
+; VI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0x30
+; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]]
+; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], [[SMAG]]
+; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff
+; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 
 ; EG: BFI_INT
 define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind {
@@ -25,7 +28,7 @@ define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign
 }
 
 ; FUNC-LABEL: {{^}}test_copysign_v2f32:
-; SI: s_endpgm
+; GCN: s_endpgm
 
 ; EG: BFI_INT
 ; EG: BFI_INT
@@ -36,7 +39,7 @@ define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %ma
 }
 
 ; FUNC-LABEL: {{^}}test_copysign_v4f32:
-; SI: s_endpgm
+; GCN: s_endpgm
 
 ; EG: BFI_INT
 ; EG: BFI_INT
diff --git a/test/CodeGen/R600/fcopysign.f64.ll b/test/CodeGen/R600/fcopysign.f64.ll
index a14a493f72c8..3d8c55993089 100644
--- a/test/CodeGen/R600/fcopysign.f64.ll
+++ b/test/CodeGen/R600/fcopysign.f64.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 
 declare double @llvm.copysign.f64(double, double) nounwind readnone
 declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) nounwind readnone
@@ -7,13 +8,15 @@ declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind r
 ; FUNC-LABEL: {{^}}test_copysign_f64:
 ; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; SI-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
-; SI-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
-; SI-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff
-; SI: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
-; SI: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
-; SI: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
-; SI: s_endpgm
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
+; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
+; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
+; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff
+; GCN: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
+; GCN: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
+; GCN: s_endpgm
 define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {
   %result = call double @llvm.copysign.f64(double %mag, double %sign)
   store double %result, double addrspace(1)* %out, align 8
@@ -21,7 +24,7 @@ define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %s
 }
 
 ; FUNC-LABEL: {{^}}test_copysign_v2f64:
-; SI: s_endpgm
+; GCN: s_endpgm
 define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind {
   %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign)
   store <2 x double> %result, <2 x double> addrspace(1)* %out, align 8
@@ -29,7 +32,7 @@ define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %
 }
 
 ; FUNC-LABEL: {{^}}test_copysign_v4f64:
-; SI: s_endpgm
+; GCN: s_endpgm
 define void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind {
   %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign)
   store <4 x double> %result, <4 x double> addrspace(1)* %out, align 8
diff --git a/test/CodeGen/R600/fdiv.f64.ll b/test/CodeGen/R600/fdiv.f64.ll
index 276642f99014..7c022e38c808 100644
--- a/test/CodeGen/R600/fdiv.f64.ll
+++ b/test/CodeGen/R600/fdiv.f64.ll
@@ -25,14 +25,14 @@
 ; COMMON-DAG: v_fma_f64 [[FMA3:v\[[0-9]+:[0-9]+\]]], [[FMA1]], [[FMA2]], [[FMA1]]
 ; COMMON-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[SCALE1]], [[FMA3]]
 ; COMMON-DAG: v_fma_f64 [[FMA4:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[MUL]], [[SCALE1]]
-; COMMON: v_div_fmas_f64 [[FMAS:v\[[0-9]+:[0-9]+\]]], [[FMA3]], [[FMA4]], [[MUL]]
+; COMMON: v_div_fmas_f64 [[FMAS:v\[[0-9]+:[0-9]+\]]], [[FMA4]], [[FMA3]], [[MUL]]
 ; COMMON: v_div_fixup_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[FMAS]], [[DEN]], [[NUM]]
 ; COMMON: buffer_store_dwordx2 [[RESULT]]
 ; COMMON: s_endpgm
 define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) nounwind {
-  %gep.1 = getelementptr double addrspace(1)* %in, i32 1
-  %num = load double addrspace(1)* %in
-  %den = load double addrspace(1)* %gep.1
+  %gep.1 = getelementptr double, double addrspace(1)* %in, i32 1
+  %num = load double, double addrspace(1)* %in
+  %den = load double, double addrspace(1)* %gep.1
   %result = fdiv double %num, %den
   store double %result, double addrspace(1)* %out
   ret void
@@ -40,7 +40,7 @@ define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) nounw
 
 ; COMMON-LABEL: {{^}}fdiv_f64_s_v:
 define void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) nounwind {
-  %den = load double addrspace(1)* %in
+  %den = load double, double addrspace(1)* %in
   %result = fdiv double %num, %den
   store double %result, double addrspace(1)* %out
   ret void
@@ -48,7 +48,7 @@ define void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, d
 
 ; COMMON-LABEL: {{^}}fdiv_f64_v_s:
 define void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) nounwind {
-  %num = load double addrspace(1)* %in
+  %num = load double, double addrspace(1)* %in
   %result = fdiv double %num, %den
   store double %result, double addrspace(1)* %out
   ret void
@@ -63,9 +63,9 @@ define void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) n
 
 ; COMMON-LABEL: {{^}}v_fdiv_v2f64:
 define void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) nounwind {
-  %gep.1 = getelementptr <2 x double> addrspace(1)* %in, i32 1
-  %num = load <2 x double> addrspace(1)* %in
-  %den = load <2 x double> addrspace(1)* %gep.1
+  %gep.1 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in, i32 1
+  %num = load <2 x double>, <2 x double> addrspace(1)* %in
+  %den = load <2 x double>, <2 x double> addrspace(1)* %gep.1
   %result = fdiv <2 x double> %num, %den
   store <2 x double> %result, <2 x double> addrspace(1)* %out
   ret void
@@ -80,9 +80,9 @@ define void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2
 
 ; COMMON-LABEL: {{^}}v_fdiv_v4f64:
 define void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) nounwind {
-  %gep.1 = getelementptr <4 x double> addrspace(1)* %in, i32 1
-  %num = load <4 x double> addrspace(1)* %in
-  %den = load <4 x double> addrspace(1)* %gep.1
+  %gep.1 = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1
+  %num = load <4 x double>, <4 x double> addrspace(1)* %in
+  %den = load <4 x double>, <4 x double> addrspace(1)* %gep.1
   %result = fdiv <4 x double> %num, %den
   store <4 x double> %result, <4 x double> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/fdiv.ll b/test/CodeGen/R600/fdiv.ll
index 603287fbdf4f..7cbf87336399 100644
--- a/test/CodeGen/R600/fdiv.ll
+++ b/test/CodeGen/R600/fdiv.ll
@@ -59,9 +59,9 @@ entry:
 ; SI-DAG: v_rcp_f32
 ; SI-DAG: v_mul_f32
 define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
-  %a = load <4 x float> addrspace(1) * %in
-  %b = load <4 x float> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
+  %a = load <4 x float>, <4 x float> addrspace(1) * %in
+  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
   %result = fdiv <4 x float> %a, %b
   store <4 x float> %result, <4 x float> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/fetch-limits.r600.ll b/test/CodeGen/R600/fetch-limits.r600.ll
index d35573e818d4..e7160ef5d726 100644
--- a/test/CodeGen/R600/fetch-limits.r600.ll
+++ b/test/CodeGen/R600/fetch-limits.r600.ll
@@ -9,15 +9,15 @@
 
 define void @fetch_limits_r600() #0 {
 entry:
-  %0 = load <4 x float> addrspace(8)* null
-  %1 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %2 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %3 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
-  %4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
-  %5 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
-  %6 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
-  %7 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
-  %8 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+  %0 = load <4 x float>, <4 x float> addrspace(8)* null
+  %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+  %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
   %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1)
   %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1)
   %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1)
diff --git a/test/CodeGen/R600/fetch-limits.r700+.ll b/test/CodeGen/R600/fetch-limits.r700+.ll
index 17760a05caa4..acaea2aa7943 100644
--- a/test/CodeGen/R600/fetch-limits.r700+.ll
+++ b/test/CodeGen/R600/fetch-limits.r700+.ll
@@ -18,23 +18,23 @@
 
 define void @fetch_limits_r700() #0 {
 entry:
-  %0 = load <4 x float> addrspace(8)* null
-  %1 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %2 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %3 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
-  %4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
-  %5 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
-  %6 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
-  %7 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
-  %8 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
-  %9 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
-  %10 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
-  %11 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
-  %12 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12)
-  %13 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 13)
-  %14 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
-  %15 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 15)
-  %16 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+  %0 = load <4 x float>, <4 x float> addrspace(8)* null
+  %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+  %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+  %9 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+  %11 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+  %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12)
+  %13 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 13)
+  %14 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+  %15 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 15)
+  %16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
   %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1)
   %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1)
   %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1)
diff --git a/test/CodeGen/R600/ffloor.f64.ll b/test/CodeGen/R600/ffloor.f64.ll
new file mode 100644
index 000000000000..45f8382c3929
--- /dev/null
+++ b/test/CodeGen/R600/ffloor.f64.ll
@@ -0,0 +1,127 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+
+declare double @llvm.fabs.f64(double %Val)
+declare double @llvm.floor.f64(double) nounwind readnone
+declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone
+declare <3 x double> @llvm.floor.v3f64(<3 x double>) nounwind readnone
+declare <4 x double> @llvm.floor.v4f64(<4 x double>) nounwind readnone
+declare <8 x double> @llvm.floor.v8f64(<8 x double>) nounwind readnone
+declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone
+
+; FUNC-LABEL: {{^}}ffloor_f64:
+; CI: v_floor_f64_e32
+; SI: v_fract_f64_e32
+; SI: v_min_f64
+; SI: v_cmp_class_f64_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_add_f64
+; SI: s_endpgm
+define void @ffloor_f64(double addrspace(1)* %out, double %x) {
+  %y = call double @llvm.floor.f64(double %x) nounwind readnone
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}ffloor_f64_neg:
+; CI: v_floor_f64_e64
+; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT:s[[0-9]+:[0-9]+]]]
+; SI: v_min_f64
+; SI: v_cmp_class_f64_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT]]
+; SI: s_endpgm
+define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) {
+  %neg = fsub double 0.0, %x
+  %y = call double @llvm.floor.f64(double %neg) nounwind readnone
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}ffloor_f64_neg_abs:
+; CI: v_floor_f64_e64
+; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT:s[[0-9]+:[0-9]+]]]|
+; SI: v_min_f64
+; SI: v_cmp_class_f64_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT]]|
+; SI: s_endpgm
+define void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) {
+  %abs = call double @llvm.fabs.f64(double %x)
+  %neg = fsub double 0.0, %abs
+  %y = call double @llvm.floor.f64(double %neg) nounwind readnone
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}ffloor_v2f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+  %y = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) nounwind readnone
+  store <2 x double> %y, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; FIXME-FUNC-LABEL: {{^}}ffloor_v3f64:
+; FIXME-CI: v_floor_f64_e32
+; FIXME-CI: v_floor_f64_e32
+; FIXME-CI: v_floor_f64_e32
+; define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+;   %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone
+;   store <3 x double> %y, <3 x double> addrspace(1)* %out
+;   ret void
+; }
+
+; FUNC-LABEL: {{^}}ffloor_v4f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+  %y = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone
+  store <4 x double> %y, <4 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}ffloor_v8f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+  %y = call <8 x double> @llvm.floor.v8f64(<8 x double> %x) nounwind readnone
+  store <8 x double> %y, <8 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}ffloor_v16f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+define void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+  %y = call <16 x double> @llvm.floor.v16f64(<16 x double> %x) nounwind readnone
+  store <16 x double> %y, <16 x double> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/ffloor.ll b/test/CodeGen/R600/ffloor.ll
index 9038ff81b073..61c46ac2bc03 100644
--- a/test/CodeGen/R600/ffloor.ll
+++ b/test/CodeGen/R600/ffloor.ll
@@ -1,106 +1,49 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
-
-declare double @llvm.floor.f64(double) nounwind readnone
-declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone
-declare <3 x double> @llvm.floor.v3f64(<3 x double>) nounwind readnone
-declare <4 x double> @llvm.floor.v4f64(<4 x double>) nounwind readnone
-declare <8 x double> @llvm.floor.v8f64(<8 x double>) nounwind readnone
-declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}floor_f32:
+; SI: v_floor_f32_e32
+; R600: FLOOR
+define void @floor_f32(float addrspace(1)* %out, float %in) {
+  %tmp = call float @llvm.floor.f32(float %in) #0
+  store float %tmp, float addrspace(1)* %out
+  ret void
+}
 
-; FUNC-LABEL: {{^}}ffloor_f64:
-; CI: v_floor_f64_e32
+; FUNC-LABEL: {{^}}floor_v2f32:
+; SI: v_floor_f32_e32
+; SI: v_floor_f32_e32
 
-; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
-; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
-; SI: s_lshr_b64
-; SI: s_not_b64
-; SI: s_and_b64
-; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI-DAG: cmp_lt_i32
-; SI: cndmask_b32
-; SI: cndmask_b32
-; SI: cmp_gt_i32
-; SI: cndmask_b32
-; SI: cndmask_b32
-; SI: v_cmp_lg_f64
-; SI: v_cmp_lt_f64
-; SI: s_and_b64
-; SI: v_cndmask_b32
-; SI: v_cndmask_b32
-; SI: v_add_f64
-; SI: s_endpgm
-define void @ffloor_f64(double addrspace(1)* %out, double %x) {
-  %y = call double @llvm.floor.f64(double %x) nounwind readnone
-  store double %y, double addrspace(1)* %out
+define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+  %tmp = call <2 x float> @llvm.floor.v2f32(<2 x float> %in) #0
+  store <2 x float> %tmp, <2 x float> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}ffloor_v2f64:
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
-  %y = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) nounwind readnone
-  store <2 x double> %y, <2 x double> addrspace(1)* %out
+; FUNC-LABEL: {{^}}floor_v4f32:
+; SI: v_floor_f32_e32
+; SI: v_floor_f32_e32
+; SI: v_floor_f32_e32
+; SI: v_floor_f32_e32
+
+; R600: FLOOR
+; R600: FLOOR
+; R600: FLOOR
+; R600: FLOOR
+define void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+  %tmp = call <4 x float> @llvm.floor.v4f32(<4 x float> %in) #0
+  store <4 x float> %tmp, <4 x float> addrspace(1)* %out
   ret void
 }
 
-; FIXME-FUNC-LABEL: {{^}}ffloor_v3f64:
-; FIXME-CI: v_floor_f64_e32
-; FIXME-CI: v_floor_f64_e32
-; FIXME-CI: v_floor_f64_e32
-; define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
-;   %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone
-;   store <3 x double> %y, <3 x double> addrspace(1)* %out
-;   ret void
-; }
+; Function Attrs: nounwind readonly
+declare float @llvm.floor.f32(float) #0
 
-; FUNC-LABEL: {{^}}ffloor_v4f64:
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
-  %y = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone
-  store <4 x double> %y, <4 x double> addrspace(1)* %out
-  ret void
-}
+; Function Attrs: nounwind readonly
+declare <2 x float> @llvm.floor.v2f32(<2 x float>) #0
 
-; FUNC-LABEL: {{^}}ffloor_v8f64:
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
-  %y = call <8 x double> @llvm.floor.v8f64(<8 x double> %x) nounwind readnone
-  store <8 x double> %y, <8 x double> addrspace(1)* %out
-  ret void
-}
+; Function Attrs: nounwind readonly
+declare <4 x float> @llvm.floor.v4f32(<4 x float>) #0
 
-; FUNC-LABEL: {{^}}ffloor_v16f64:
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-; CI: v_floor_f64_e32
-define void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
-  %y = call <16 x double> @llvm.floor.v16f64(<16 x double> %x) nounwind readnone
-  store <16 x double> %y, <16 x double> addrspace(1)* %out
-  ret void
-}
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/R600/flat-address-space.ll b/test/CodeGen/R600/flat-address-space.ll
index 2e98bf51b23b..425d67d5b07c 100644
--- a/test/CodeGen/R600/flat-address-space.ll
+++ b/test/CodeGen/R600/flat-address-space.ll
@@ -26,7 +26,7 @@ global:
 end:
   %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ]
   store i32 %x, i32 addrspace(4)* %fptr, align 4
-;  %val = load i32 addrspace(4)* %fptr, align 4
+;  %val = load i32, i32 addrspace(4)* %fptr, align 4
 ;  store i32 %val, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -87,7 +87,7 @@ define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
 ; CHECK: flat_load_dword
 define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
-  %fload = load i32 addrspace(4)* %fptr, align 4
+  %fload = load i32, i32 addrspace(4)* %fptr, align 4
   store i32 %fload, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -96,7 +96,7 @@ define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noa
 ; CHECK: flat_load_dwordx2
 define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
-  %fload = load i64 addrspace(4)* %fptr, align 4
+  %fload = load i64, i64 addrspace(4)* %fptr, align 4
   store i64 %fload, i64 addrspace(1)* %out, align 8
   ret void
 }
@@ -105,7 +105,7 @@ define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noa
 ; CHECK: flat_load_dwordx4
 define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
-  %fload = load <4 x i32> addrspace(4)* %fptr, align 4
+  %fload = load <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 4
   store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
   ret void
 }
@@ -114,7 +114,7 @@ define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> add
 ; CHECK: flat_load_sbyte
 define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
-  %fload = load i8 addrspace(4)* %fptr, align 4
+  %fload = load i8, i8 addrspace(4)* %fptr, align 4
   %ext = sext i8 %fload to i32
   store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
@@ -124,7 +124,7 @@ define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* n
 ; CHECK: flat_load_ubyte
 define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
-  %fload = load i8 addrspace(4)* %fptr, align 4
+  %fload = load i8, i8 addrspace(4)* %fptr, align 4
   %ext = zext i8 %fload to i32
   store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
@@ -134,7 +134,7 @@ define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* n
 ; CHECK: flat_load_sshort
 define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
-  %fload = load i16 addrspace(4)* %fptr, align 4
+  %fload = load i16, i16 addrspace(4)* %fptr, align 4
   %ext = sext i16 %fload to i32
   store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
@@ -144,7 +144,7 @@ define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)*
 ; CHECK: flat_load_ushort
 define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
-  %fload = load i16 addrspace(4)* %fptr, align 4
+  %fload = load i16, i16 addrspace(4)* %fptr, align 4
   %ext = zext i16 %fload to i32
   store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
@@ -166,12 +166,12 @@ define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)*
 define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
   %alloca = alloca i32, i32 9, align 4
   %x = call i32 @llvm.r600.read.tidig.x() #3
-  %pptr = getelementptr i32* %alloca, i32 %x
+  %pptr = getelementptr i32, i32* %alloca, i32 %x
   %fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
   store i32 %x, i32 addrspace(4)* %fptr
   ; Dummy call
   call void @llvm.AMDGPU.barrier.local() #1
-  %reload = load i32 addrspace(4)* %fptr, align 4
+  %reload = load i32, i32 addrspace(4)* %fptr, align 4
   store i32 %reload, i32 addrspace(1)* %out, align 4
   ret void
 }
diff --git a/test/CodeGen/R600/floor.ll b/test/CodeGen/R600/floor.ll
index 67e86c41fdcf..c6bfb8567a0f 100644
--- a/test/CodeGen/R600/floor.ll
+++ b/test/CodeGen/R600/floor.ll
@@ -1,7 +1,6 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-;CHECK: FLOOR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s
 
+; CHECK: FLOOR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 define void @test(<4 x float> inreg %reg0) #0 {
    %r0 = extractelement <4 x float> %reg0, i32 0
    %r1 = call float @floor(float %r0)
@@ -13,4 +12,4 @@ define void @test(<4 x float> inreg %reg0) #0 {
 declare float @floor(float) readonly
 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
 
-attributes #0 = { "ShaderType"="0" }
-\ No newline at end of file
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/fma-combine.ll b/test/CodeGen/R600/fma-combine.ll
new file mode 100644
index 000000000000..bd574b877117
--- /dev/null
+++ b/test/CodeGen/R600/fma-combine.ll
@@ -0,0 +1,368 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-FASTFMAF -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-SLOWFMAF -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare double @llvm.fabs.f64(double) #0
+declare double @llvm.fma.f64(double, double, double) #0
+declare float @llvm.fma.f32(float, float, float) #0
+
+; (fadd (fmul x, y), z) -> (fma x, y, z)
+; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+  %a = load double, double addrspace(1)* %gep.0
+  %b = load double, double addrspace(1)* %gep.1
+  %c = load double, double addrspace(1)* %gep.2
+
+  %mul = fmul double %a, %b
+  %fma = fadd double %mul, %c
+  store double %fma, double addrspace(1)* %gep.out
+  ret void
+}
+
+; (fadd (fmul x, y), z) -> (fma x, y, z)
+; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
+
+  %a = load double, double addrspace(1)* %gep.0
+  %b = load double, double addrspace(1)* %gep.1
+  %c = load double, double addrspace(1)* %gep.2
+  %d = load double, double addrspace(1)* %gep.3
+
+  %mul = fmul double %a, %b
+  %fma0 = fadd double %mul, %c
+  %fma1 = fadd double %mul, %d
+  store double %fma0, double addrspace(1)* %gep.out.0
+  store double %fma1, double addrspace(1)* %gep.out.1
+  ret void
+}
+
+; (fadd x, (fmul y, z)) -> (fma y, z, x)
+; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+  %a = load double, double addrspace(1)* %gep.0
+  %b = load double, double addrspace(1)* %gep.1
+  %c = load double, double addrspace(1)* %gep.2
+
+  %mul = fmul double %a, %b
+  %fma = fadd double %c, %mul
+  store double %fma, double addrspace(1)* %gep.out
+  ret void
+}
+
+; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+  %a = load double, double addrspace(1)* %gep.0
+  %b = load double, double addrspace(1)* %gep.1
+  %c = load double, double addrspace(1)* %gep.2
+
+  %mul = fmul double %a, %b
+  %fma = fsub double %mul, %c
+  store double %fma, double addrspace(1)* %gep.out
+  ret void
+}
+
+; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
+
+  %a = load double, double addrspace(1)* %gep.0
+  %b = load double, double addrspace(1)* %gep.1
+  %c = load double, double addrspace(1)* %gep.2
+  %d = load double, double addrspace(1)* %gep.3
+
+  %mul = fmul double %a, %b
+  %fma0 = fsub double %mul, %c
+  %fma1 = fsub double %mul, %d
+  store double %fma0, double addrspace(1)* %gep.out.0
+  store double %fma1, double addrspace(1)* %gep.out.1
+  ret void
+}
+
+; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+  %a = load double, double addrspace(1)* %gep.0
+  %b = load double, double addrspace(1)* %gep.1
+  %c = load double, double addrspace(1)* %gep.2
+
+  %mul = fmul double %a, %b
+  %fma = fsub double %c, %mul
+  store double %fma, double addrspace(1)* %gep.out
+  ret void
+}
+
+; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
+
+  %a = load double, double addrspace(1)* %gep.0
+  %b = load double, double addrspace(1)* %gep.1
+  %c = load double, double addrspace(1)* %gep.2
+  %d = load double, double addrspace(1)* %gep.3
+
+  %mul = fmul double %a, %b
+  %fma0 = fsub double %c, %mul
+  %fma1 = fsub double %d, %mul
+  store double %fma0, double addrspace(1)* %gep.out.0
+  store double %fma1, double addrspace(1)* %gep.out.1
+  ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+  %a = load double, double addrspace(1)* %gep.0
+  %b = load double, double addrspace(1)* %gep.1
+  %c = load double, double addrspace(1)* %gep.2
+
+  %mul = fmul double %a, %b
+  %mul.neg = fsub double -0.0, %mul
+  %fma = fsub double %mul.neg, %c
+
+  store double %fma, double addrspace(1)* %gep.out
+  ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
+
+  %a = load double, double addrspace(1)* %gep.0
+  %b = load double, double addrspace(1)* %gep.1
+  %c = load double, double addrspace(1)* %gep.2
+  %d = load double, double addrspace(1)* %gep.3
+
+  %mul = fmul double %a, %b
+  %mul.neg = fsub double -0.0, %mul
+  %fma0 = fsub double %mul.neg, %c
+  %fma1 = fsub double %mul.neg, %d
+
+  store double %fma0, double addrspace(1)* %gep.out.0
+  store double %fma1, double addrspace(1)* %gep.out.1
+  ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
+
+  %a = load double, double addrspace(1)* %gep.0
+  %b = load double, double addrspace(1)* %gep.1
+  %c = load double, double addrspace(1)* %gep.2
+  %d = load double, double addrspace(1)* %gep.3
+
+  %mul = fmul double %a, %b
+  %mul.neg = fsub double -0.0, %mul
+  %fma0 = fsub double %mul.neg, %c
+  %fma1 = fsub double %mul, %d
+
+  store double %fma0, double addrspace(1)* %gep.out.0
+  store double %fma1, double addrspace(1)* %gep.out.1
+  ret void
+}
+
+; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
+; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
+; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
+  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
+  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+  %x = load double, double addrspace(1)* %gep.0
+  %y = load double, double addrspace(1)* %gep.1
+  %z = load double, double addrspace(1)* %gep.2
+  %u = load double, double addrspace(1)* %gep.3
+  %v = load double, double addrspace(1)* %gep.4
+
+  %tmp0 = fmul double %u, %v
+  %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
+  %tmp2 = fsub double %tmp1, %z
+
+  store double %tmp2, double addrspace(1)* %gep.out
+  ret void
+}
+
+; fold (fsub x, (fma y, z, (fmul u, v)))
+;   -> (fma (fneg y), z, (fma (fneg u), v, x))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
+; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
+; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
+  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
+  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+  %x = load double, double addrspace(1)* %gep.0
+  %y = load double, double addrspace(1)* %gep.1
+  %z = load double, double addrspace(1)* %gep.2
+  %u = load double, double addrspace(1)* %gep.3
+  %v = load double, double addrspace(1)* %gep.4
+
+  %tmp0 = fmul double %u, %v
+  %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
+  %tmp2 = fsub double %x, %tmp1
+
+  store double %tmp2, double addrspace(1)* %gep.out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/R600/fma.f64.ll b/test/CodeGen/R600/fma.f64.ll
index bca312bfa751..0a55ef778557 100644
--- a/test/CodeGen/R600/fma.f64.ll
+++ b/test/CodeGen/R600/fma.f64.ll
@@ -10,9 +10,9 @@ declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) n
 ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2, double addrspace(1)* %in3) {
-   %r0 = load double addrspace(1)* %in1
-   %r1 = load double addrspace(1)* %in2
-   %r2 = load double addrspace(1)* %in3
+   %r0 = load double, double addrspace(1)* %in1
+   %r1 = load double, double addrspace(1)* %in2
+   %r2 = load double, double addrspace(1)* %in3
    %r3 = tail call double @llvm.fma.f64(double %r0, double %r1, double %r2)
    store double %r3, double addrspace(1)* %out
    ret void
@@ -23,9 +23,9 @@ define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 ; SI: v_fma_f64
 define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
                        <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) {
-   %r0 = load <2 x double> addrspace(1)* %in1
-   %r1 = load <2 x double> addrspace(1)* %in2
-   %r2 = load <2 x double> addrspace(1)* %in3
+   %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1
+   %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2
+   %r2 = load <2 x double>, <2 x double> addrspace(1)* %in3
    %r3 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2)
    store <2 x double> %r3, <2 x double> addrspace(1)* %out
    ret void
@@ -38,9 +38,9 @@ define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1
 ; SI: v_fma_f64
 define void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
                        <4 x double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) {
-   %r0 = load <4 x double> addrspace(1)* %in1
-   %r1 = load <4 x double> addrspace(1)* %in2
-   %r2 = load <4 x double> addrspace(1)* %in3
+   %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1
+   %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2
+   %r2 = load <4 x double>, <4 x double> addrspace(1)* %in3
    %r3 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %r0, <4 x double> %r1, <4 x double> %r2)
    store <4 x double> %r3, <4 x double> addrspace(1)* %out
    ret void
diff --git a/test/CodeGen/R600/fma.ll b/test/CodeGen/R600/fma.ll
index f3861ffa2835..d6024aa0b4c5 100644
--- a/test/CodeGen/R600/fma.ll
+++ b/test/CodeGen/R600/fma.ll
@@ -14,9 +14,9 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 ; EG: FMA {{\*? *}}[[RES]]
 define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                      float addrspace(1)* %in2, float addrspace(1)* %in3) {
-  %r0 = load float addrspace(1)* %in1
-  %r1 = load float addrspace(1)* %in2
-  %r2 = load float addrspace(1)* %in3
+  %r0 = load float, float addrspace(1)* %in1
+  %r1 = load float, float addrspace(1)* %in2
+  %r2 = load float, float addrspace(1)* %in3
   %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2)
   store float %r3, float addrspace(1)* %out
   ret void
@@ -31,9 +31,9 @@ define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
 ; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]]
 define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
                        <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) {
-  %r0 = load <2 x float> addrspace(1)* %in1
-  %r1 = load <2 x float> addrspace(1)* %in2
-  %r2 = load <2 x float> addrspace(1)* %in3
+  %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1
+  %r1 = load <2 x float>, <2 x float> addrspace(1)* %in2
+  %r2 = load <2 x float>, <2 x float> addrspace(1)* %in3
   %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2)
   store <2 x float> %r3, <2 x float> addrspace(1)* %out
   ret void
@@ -52,9 +52,9 @@ define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)*
 ; EG-DAG: FMA {{\*? *}}[[RES]].W
 define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
                        <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) {
-  %r0 = load <4 x float> addrspace(1)* %in1
-  %r1 = load <4 x float> addrspace(1)* %in2
-  %r2 = load <4 x float> addrspace(1)* %in3
+  %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1
+  %r1 = load <4 x float>, <4 x float> addrspace(1)* %in2
+  %r2 = load <4 x float>, <4 x float> addrspace(1)* %in3
   %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2)
   store <4 x float> %r3, <4 x float> addrspace(1)* %out
   ret void
@@ -64,12 +64,12 @@ define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)*
 ; SI: v_fma_f32 {{v[0-9]+}}, 2.0, {{v[0-9]+}}, {{v[0-9]+}}
 define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
   %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
-  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
-  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
+  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 
-  %a = load float addrspace(1)* %in.a.gep, align 4
-  %b = load float addrspace(1)* %in.b.gep, align 4
+  %a = load float, float addrspace(1)* %in.a.gep, align 4
+  %b = load float, float addrspace(1)* %in.b.gep, align 4
 
   %fma = call float @llvm.fma.f32(float %a, float 2.0, float %b)
   store float %fma, float addrspace(1)* %out.gep, align 4
@@ -79,12 +79,12 @@ define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, fl
 ; FUNC-LABEL: @fma_commute_mul_s_f32
 define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind {
   %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
-  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
-  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
+  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 
-  %a = load float addrspace(1)* %in.a.gep, align 4
-  %c = load float addrspace(1)* %in.b.gep, align 4
+  %a = load float, float addrspace(1)* %in.a.gep, align 4
+  %c = load float, float addrspace(1)* %in.b.gep, align 4
 
   %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
   store float %fma, float addrspace(1)* %out.gep, align 4
diff --git a/test/CodeGen/R600/fmax3.f64.ll b/test/CodeGen/R600/fmax3.f64.ll
index 5ca789de2a08..f78c71b28264 100644
--- a/test/CodeGen/R600/fmax3.f64.ll
+++ b/test/CodeGen/R600/fmax3.f64.ll
@@ -12,11 +12,11 @@ declare double @llvm.maxnum.f64(double, double) nounwind readnone
 ; SI: buffer_store_dwordx2 [[RESULT]],
 ; SI: s_endpgm
 define void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
-  %bptr = getelementptr double addrspace(1)* %aptr, i32 1
-  %cptr = getelementptr double addrspace(1)* %aptr, i32 2
-  %a = load double addrspace(1)* %aptr, align 8
-  %b = load double addrspace(1)* %bptr, align 8
-  %c = load double addrspace(1)* %cptr, align 8
+  %bptr = getelementptr double, double addrspace(1)* %aptr, i32 1
+  %cptr = getelementptr double, double addrspace(1)* %aptr, i32 2
+  %a = load double, double addrspace(1)* %aptr, align 8
+  %b = load double, double addrspace(1)* %bptr, align 8
+  %c = load double, double addrspace(1)* %cptr, align 8
   %f0 = call double @llvm.maxnum.f64(double %a, double %b) nounwind readnone
   %f1 = call double @llvm.maxnum.f64(double %f0, double %c) nounwind readnone
   store double %f1, double addrspace(1)* %out, align 8
diff --git a/test/CodeGen/R600/fmax3.ll b/test/CodeGen/R600/fmax3.ll
index e1b477c5921e..c3028a6217d5 100644
--- a/test/CodeGen/R600/fmax3.ll
+++ b/test/CodeGen/R600/fmax3.ll
@@ -4,16 +4,16 @@
 declare float @llvm.maxnum.f32(float, float) nounwind readnone
 
 ; SI-LABEL: {{^}}test_fmax3_olt_0:
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
 ; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
 ; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
 define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
-  %a = load float addrspace(1)* %aptr, align 4
-  %b = load float addrspace(1)* %bptr, align 4
-  %c = load float addrspace(1)* %cptr, align 4
+  %a = load float, float addrspace(1)* %aptr, align 4
+  %b = load float, float addrspace(1)* %bptr, align 4
+  %c = load float, float addrspace(1)* %cptr, align 4
   %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
   %f1 = call float @llvm.maxnum.f32(float %f0, float %c) nounwind readnone
   store float %f1, float addrspace(1)* %out, align 4
@@ -22,16 +22,16 @@ define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %apt
 
 ; Commute operand of second fmax
 ; SI-LABEL: {{^}}test_fmax3_olt_1:
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
 ; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
 ; SI: buffer_load_dword [[REGC:v[0-9]+]]
 ; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
 define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
-  %a = load float addrspace(1)* %aptr, align 4
-  %b = load float addrspace(1)* %bptr, align 4
-  %c = load float addrspace(1)* %cptr, align 4
+  %a = load float, float addrspace(1)* %aptr, align 4
+  %b = load float, float addrspace(1)* %bptr, align 4
+  %c = load float, float addrspace(1)* %cptr, align 4
   %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
   %f1 = call float @llvm.maxnum.f32(float %c, float %f0) nounwind readnone
   store float %f1, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/R600/fmax_legacy.f64.ll b/test/CodeGen/R600/fmax_legacy.f64.ll
index a615825a45d3..828243888ac7 100644
--- a/test/CodeGen/R600/fmax_legacy.f64.ll
+++ b/test/CodeGen/R600/fmax_legacy.f64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; Make sure we don't try to form FMAX_LEGACY nodes with f64
 
 declare i32 @llvm.r600.read.tidig.x() #1
@@ -6,11 +6,11 @@ declare i32 @llvm.r600.read.tidig.x() #1
 ; FUNC-LABEL: @test_fmax_legacy_uge_f64
 define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
 
-  %a = load double addrspace(1)* %gep.0, align 8
-  %b = load double addrspace(1)* %gep.1, align 8
+  %a = load double, double addrspace(1)* %gep.0, align 8
+  %b = load double, double addrspace(1)* %gep.1, align 8
 
   %cmp = fcmp uge double %a, %b
   %val = select i1 %cmp, double %a, double %b
@@ -21,11 +21,11 @@ define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspac
 ; FUNC-LABEL: @test_fmax_legacy_oge_f64
 define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
 
-  %a = load double addrspace(1)* %gep.0, align 8
-  %b = load double addrspace(1)* %gep.1, align 8
+  %a = load double, double addrspace(1)* %gep.0, align 8
+  %b = load double, double addrspace(1)* %gep.1, align 8
 
   %cmp = fcmp oge double %a, %b
   %val = select i1 %cmp, double %a, double %b
@@ -36,11 +36,11 @@ define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspac
 ; FUNC-LABEL: @test_fmax_legacy_ugt_f64
 define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
 
-  %a = load double addrspace(1)* %gep.0, align 8
-  %b = load double addrspace(1)* %gep.1, align 8
+  %a = load double, double addrspace(1)* %gep.0, align 8
+  %b = load double, double addrspace(1)* %gep.1, align 8
 
   %cmp = fcmp ugt double %a, %b
   %val = select i1 %cmp, double %a, double %b
@@ -51,11 +51,11 @@ define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspac
 ; FUNC-LABEL: @test_fmax_legacy_ogt_f64
 define void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
 
-  %a = load double addrspace(1)* %gep.0, align 8
-  %b = load double addrspace(1)* %gep.1, align 8
+  %a = load double, double addrspace(1)* %gep.0, align 8
+  %b = load double, double addrspace(1)* %gep.1, align 8
 
   %cmp = fcmp ogt double %a, %b
   %val = select i1 %cmp, double %a, double %b
diff --git a/test/CodeGen/R600/fmax_legacy.ll b/test/CodeGen/R600/fmax_legacy.ll
index 46f0e9831e6a..413957d2982a 100644
--- a/test/CodeGen/R600/fmax_legacy.ll
+++ b/test/CodeGen/R600/fmax_legacy.ll
@@ -15,11 +15,11 @@ declare i32 @llvm.r600.read.tidig.x() #1
 ; EG: MAX
 define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
 
-  %a = load float addrspace(1)* %gep.0, align 4
-  %b = load float addrspace(1)* %gep.1, align 4
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
 
   %cmp = fcmp uge float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -35,11 +35,11 @@ define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(
 ; EG: MAX
 define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
 
-  %a = load float addrspace(1)* %gep.0, align 4
-  %b = load float addrspace(1)* %gep.1, align 4
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
 
   %cmp = fcmp oge float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -55,11 +55,11 @@ define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(
 ; EG: MAX
 define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
 
-  %a = load float addrspace(1)* %gep.0, align 4
-  %b = load float addrspace(1)* %gep.1, align 4
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
 
   %cmp = fcmp ugt float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -75,11 +75,11 @@ define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(
 ; EG: MAX
 define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
 
-  %a = load float addrspace(1)* %gep.0, align 4
-  %b = load float addrspace(1)* %gep.1, align 4
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
 
   %cmp = fcmp ogt float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -99,11 +99,11 @@ define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(
 ; EG: MAX
 define void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
 
-  %a = load float addrspace(1)* %gep.0, align 4
-  %b = load float addrspace(1)* %gep.1, align 4
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
 
   %cmp = fcmp ogt float %a, %b
   %val = select i1 %cmp, float %a, float %b
diff --git a/test/CodeGen/R600/fmaxnum.ll b/test/CodeGen/R600/fmaxnum.ll
index c105598ff811..3029bd02e4db 100644
--- a/test/CodeGen/R600/fmaxnum.ll
+++ b/test/CodeGen/R600/fmaxnum.ll
@@ -11,6 +11,9 @@ declare double @llvm.maxnum.f64(double, double)
 
 ; FUNC-LABEL: @test_fmax_f32
 ; SI: v_max_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MAX_DX10 {{.*}}[[OUT]]
 define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
   %val = call float @llvm.maxnum.f32(float %a, float %b) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -20,6 +23,10 @@ define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwin
 ; FUNC-LABEL: @test_fmax_v2f32
 ; SI: v_max_f32_e32
 ; SI: v_max_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
+; EG: MAX_DX10 {{.*}}[[OUT]]
+; EG: MAX_DX10 {{.*}}[[OUT]]
 define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
   %val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) #0
   store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8
@@ -31,6 +38,12 @@ define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2
 ; SI: v_max_f32_e32
 ; SI: v_max_f32_e32
 ; SI: v_max_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
+; EG: MAX_DX10 {{.*}}[[OUT]]
+; EG: MAX_DX10 {{.*}}[[OUT]]
+; EG: MAX_DX10 {{.*}}[[OUT]]
+; EG: MAX_DX10 {{.*}}[[OUT]]
 define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
   %val = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) #0
   store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16
@@ -46,6 +59,17 @@ define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4
 ; SI: v_max_f32_e32
 ; SI: v_max_f32_e32
 ; SI: v_max_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]]
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W
 define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
   %val = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> %b) #0
   store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32
@@ -69,6 +93,27 @@ define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8
 ; SI: v_max_f32_e32
 ; SI: v_max_f32_e32
 ; SI: v_max_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]]
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W
+; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].W
+; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].W
 define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
   %val = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %a, <16 x float> %b) #0
   store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64
@@ -79,6 +124,10 @@ define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a,
 ; SI-NOT: v_max_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 1.0, float 2.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -89,6 +138,11 @@ define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_max_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+; EG: 2143289344(nan)
 define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -99,6 +153,10 @@ define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_max_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 1.0, float 0x7FF8000000000000) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -109,6 +167,10 @@ define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_max_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 1.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -119,6 +181,10 @@ define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_max_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 0.0, float 0.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -129,6 +195,10 @@ define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_max_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 0.0, float -0.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -139,6 +209,10 @@ define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_max_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float -0.0, float 0.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -149,6 +223,10 @@ define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_max_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float -0.0, float -0.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -157,6 +235,10 @@ define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind {
 
 ; FUNC-LABEL: @fmax_var_immediate_f32
 ; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -165,6 +247,9 @@ define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind
 
 ; FUNC-LABEL: @fmax_immediate_var_f32
 ; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
 define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -174,6 +259,9 @@ define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind
 ; FUNC-LABEL: @fmax_var_literal_f32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
 ; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
 define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -183,6 +271,9 @@ define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
 ; FUNC-LABEL: @fmax_literal_var_f32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
 ; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
 define void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0
   store float %val, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/R600/fmin3.ll b/test/CodeGen/R600/fmin3.ll
index 716beb16bb10..0a76699b43e1 100644
--- a/test/CodeGen/R600/fmin3.ll
+++ b/test/CodeGen/R600/fmin3.ll
@@ -5,16 +5,16 @@
 declare float @llvm.minnum.f32(float, float) nounwind readnone
 
 ; SI-LABEL: {{^}}test_fmin3_olt_0:
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
 ; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
 ; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
 define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
-  %a = load float addrspace(1)* %aptr, align 4
-  %b = load float addrspace(1)* %bptr, align 4
-  %c = load float addrspace(1)* %cptr, align 4
+  %a = load float, float addrspace(1)* %aptr, align 4
+  %b = load float, float addrspace(1)* %bptr, align 4
+  %c = load float, float addrspace(1)* %cptr, align 4
   %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
   %f1 = call float @llvm.minnum.f32(float %f0, float %c) nounwind readnone
   store float %f1, float addrspace(1)* %out, align 4
@@ -23,16 +23,16 @@ define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %apt
 
 ; Commute operand of second fmin
 ; SI-LABEL: {{^}}test_fmin3_olt_1:
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
 ; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
 ; SI: buffer_load_dword [[REGC:v[0-9]+]]
 ; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
 define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
-  %a = load float addrspace(1)* %aptr, align 4
-  %b = load float addrspace(1)* %bptr, align 4
-  %c = load float addrspace(1)* %cptr, align 4
+  %a = load float, float addrspace(1)* %aptr, align 4
+  %b = load float, float addrspace(1)* %bptr, align 4
+  %c = load float, float addrspace(1)* %cptr, align 4
   %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
   %f1 = call float @llvm.minnum.f32(float %c, float %f0) nounwind readnone
   store float %f1, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/R600/fmin_legacy.f64.ll b/test/CodeGen/R600/fmin_legacy.f64.ll
index 51dcd06f9397..e19a48f3f7e2 100644
--- a/test/CodeGen/R600/fmin_legacy.f64.ll
+++ b/test/CodeGen/R600/fmin_legacy.f64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() #1
 
@@ -16,11 +16,11 @@ define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double>
 ; FUNC-LABEL: @test_fmin_legacy_ule_f64
 define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
 
-  %a = load double addrspace(1)* %gep.0, align 8
-  %b = load double addrspace(1)* %gep.1, align 8
+  %a = load double, double addrspace(1)* %gep.0, align 8
+  %b = load double, double addrspace(1)* %gep.1, align 8
 
   %cmp = fcmp ule double %a, %b
   %val = select i1 %cmp, double %a, double %b
@@ -31,11 +31,11 @@ define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspac
 ; FUNC-LABEL: @test_fmin_legacy_ole_f64
 define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
 
-  %a = load double addrspace(1)* %gep.0, align 8
-  %b = load double addrspace(1)* %gep.1, align 8
+  %a = load double, double addrspace(1)* %gep.0, align 8
+  %b = load double, double addrspace(1)* %gep.1, align 8
 
   %cmp = fcmp ole double %a, %b
   %val = select i1 %cmp, double %a, double %b
@@ -46,11 +46,11 @@ define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspac
 ; FUNC-LABEL: @test_fmin_legacy_olt_f64
 define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
 
-  %a = load double addrspace(1)* %gep.0, align 8
-  %b = load double addrspace(1)* %gep.1, align 8
+  %a = load double, double addrspace(1)* %gep.0, align 8
+  %b = load double, double addrspace(1)* %gep.1, align 8
 
   %cmp = fcmp olt double %a, %b
   %val = select i1 %cmp, double %a, double %b
@@ -61,11 +61,11 @@ define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspac
 ; FUNC-LABEL: @test_fmin_legacy_ult_f64
 define void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
 
-  %a = load double addrspace(1)* %gep.0, align 8
-  %b = load double addrspace(1)* %gep.1, align 8
+  %a = load double, double addrspace(1)* %gep.0, align 8
+  %b = load double, double addrspace(1)* %gep.1, align 8
 
   %cmp = fcmp ult double %a, %b
   %val = select i1 %cmp, double %a, double %b
diff --git a/test/CodeGen/R600/fmin_legacy.ll b/test/CodeGen/R600/fmin_legacy.ll
index 5014f6c55329..6a625c239d76 100644
--- a/test/CodeGen/R600/fmin_legacy.ll
+++ b/test/CodeGen/R600/fmin_legacy.ll
@@ -27,11 +27,11 @@ define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> in
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
 
-  %a = load float addrspace(1)* %gep.0, align 4
-  %b = load float addrspace(1)* %gep.1, align 4
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
 
   %cmp = fcmp ule float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -46,11 +46,11 @@ define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
 
-  %a = load float addrspace(1)* %gep.0, align 4
-  %b = load float addrspace(1)* %gep.1, align 4
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
 
   %cmp = fcmp ole float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -65,11 +65,11 @@ define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
 
-  %a = load float addrspace(1)* %gep.0, align 4
-  %b = load float addrspace(1)* %gep.1, align 4
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
 
   %cmp = fcmp olt float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -84,11 +84,11 @@ define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
 
-  %a = load float addrspace(1)* %gep.0, align 4
-  %b = load float addrspace(1)* %gep.1, align 4
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
 
   %cmp = fcmp ult float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -106,11 +106,11 @@ define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(
 ; SI: s_endpgm
 define void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
 
-  %a = load float addrspace(1)* %gep.0, align 4
-  %b = load float addrspace(1)* %gep.1, align 4
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
 
   %cmp = fcmp ole float %a, %b
   %val0 = select i1 %cmp, float %a, float %b
diff --git a/test/CodeGen/R600/fminnum.ll b/test/CodeGen/R600/fminnum.ll
index 6b93b830033b..4d7b52540d85 100644
--- a/test/CodeGen/R600/fminnum.ll
+++ b/test/CodeGen/R600/fminnum.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare float @llvm.minnum.f32(float, float) #0
 declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #0
@@ -9,6 +10,9 @@ declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #0
 
 ; FUNC-LABEL: @test_fmin_f32
 ; SI: v_min_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MIN_DX10 {{.*}}[[OUT]]
 define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
   %val = call float @llvm.minnum.f32(float %a, float %b) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -18,6 +22,10 @@ define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwin
 ; FUNC-LABEL: @test_fmin_v2f32
 ; SI: v_min_f32_e32
 ; SI: v_min_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
+; EG: MIN_DX10 {{.*}}[[OUT]]
+; EG: MIN_DX10 {{.*}}[[OUT]]
 define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
   %val = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b) #0
   store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8
@@ -29,6 +37,12 @@ define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2
 ; SI: v_min_f32_e32
 ; SI: v_min_f32_e32
 ; SI: v_min_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
+; EG: MIN_DX10 {{.*}}[[OUT]]
+; EG: MIN_DX10 {{.*}}[[OUT]]
+; EG: MIN_DX10 {{.*}}[[OUT]]
+; EG: MIN_DX10 {{.*}}[[OUT]]
 define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
   %val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) #0
   store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16
@@ -44,6 +58,17 @@ define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4
 ; SI: v_min_f32_e32
 ; SI: v_min_f32_e32
 ; SI: v_min_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]]
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Z
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W
 define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
   %val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b) #0
   store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32
@@ -67,6 +92,27 @@ define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8
 ; SI: v_min_f32_e32
 ; SI: v_min_f32_e32
 ; SI: v_min_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]]
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Z
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W
+; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].X
+; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Y
+; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Z
+; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].W
+; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].X
+; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Y
+; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Z
+; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].W
 define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
   %val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b) #0
   store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64
@@ -77,6 +123,10 @@ define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a,
 ; SI-NOT: v_min_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 1.0, float 2.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -87,6 +137,11 @@ define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_min_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+; EG: 2143289344({{nan|1\.#QNAN0e\+00}})
 define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -97,6 +152,10 @@ define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_min_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -107,6 +166,10 @@ define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_min_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -117,6 +180,10 @@ define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_min_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 0.0, float 0.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -127,6 +194,10 @@ define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_min_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 0.0, float -0.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -137,6 +208,10 @@ define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_min_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float -0.0, float 0.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -147,6 +222,10 @@ define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind {
 ; SI-NOT: v_min_f32_e32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
 ; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float -0.0, float -0.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -155,6 +234,9 @@ define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind {
 
 ; FUNC-LABEL: @fmin_var_immediate_f32
 ; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
 define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.minnum.f32(float %a, float 2.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -163,6 +245,9 @@ define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind
 
 ; FUNC-LABEL: @fmin_immediate_var_f32
 ; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
 define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.minnum.f32(float 2.0, float %a) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -172,6 +257,9 @@ define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind
 ; FUNC-LABEL: @fmin_var_literal_f32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
 ; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
 define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.minnum.f32(float %a, float 99.0) #0
   store float %val, float addrspace(1)* %out, align 4
@@ -181,6 +269,9 @@ define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
 ; FUNC-LABEL: @fmin_literal_var_f32
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
 ; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
 define void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.minnum.f32(float 99.0, float %a) #0
   store float %val, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/R600/fmul.ll b/test/CodeGen/R600/fmul.ll
index 6c09aa242677..68ebc4dedfe0 100644
--- a/test/CodeGen/R600/fmul.ll
+++ b/test/CodeGen/R600/fmul.ll
@@ -42,9 +42,9 @@ entry:
 ; SI: v_mul_f32
 ; SI: v_mul_f32
 define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
-  %a = load <4 x float> addrspace(1) * %in
-  %b = load <4 x float> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
+  %a = load <4 x float>, <4 x float> addrspace(1) * %in
+  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
   %result = fmul <4 x float> %a, %b
   store <4 x float> %result, <4 x float> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/fmul64.ll b/test/CodeGen/R600/fmul64.ll
index 9d7787ccbe1f..3c222eaba89d 100644
--- a/test/CodeGen/R600/fmul64.ll
+++ b/test/CodeGen/R600/fmul64.ll
@@ -5,8 +5,8 @@
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
-   %r0 = load double addrspace(1)* %in1
-   %r1 = load double addrspace(1)* %in2
+   %r0 = load double, double addrspace(1)* %in1
+   %r1 = load double, double addrspace(1)* %in2
    %r2 = fmul double %r0, %r1
    store double %r2, double addrspace(1)* %out
    ret void
@@ -17,8 +17,8 @@ define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
                         <2 x double> addrspace(1)* %in2) {
-   %r0 = load <2 x double> addrspace(1)* %in1
-   %r1 = load <2 x double> addrspace(1)* %in2
+   %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1
+   %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2
    %r2 = fmul <2 x double> %r0, %r1
    store <2 x double> %r2, <2 x double> addrspace(1)* %out
    ret void
@@ -31,8 +31,8 @@ define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 define void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
                         <4 x double> addrspace(1)* %in2) {
-   %r0 = load <4 x double> addrspace(1)* %in1
-   %r1 = load <4 x double> addrspace(1)* %in2
+   %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1
+   %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2
    %r2 = fmul <4 x double> %r0, %r1
    store <4 x double> %r2, <4 x double> addrspace(1)* %out
    ret void
diff --git a/test/CodeGen/R600/fmuladd.ll b/test/CodeGen/R600/fmuladd.ll
index 2b708639b122..ae84d841021d 100644
--- a/test/CodeGen/R600/fmuladd.ll
+++ b/test/CodeGen/R600/fmuladd.ll
@@ -10,9 +10,9 @@ declare float @llvm.fabs.f32(float) nounwind readnone
 
 define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                          float addrspace(1)* %in2, float addrspace(1)* %in3) {
-   %r0 = load float addrspace(1)* %in1
-   %r1 = load float addrspace(1)* %in2
-   %r2 = load float addrspace(1)* %in3
+   %r0 = load float, float addrspace(1)* %in1
+   %r1 = load float, float addrspace(1)* %in2
+   %r2 = load float, float addrspace(1)* %in3
    %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2)
    store float %r3, float addrspace(1)* %out
    ret void
@@ -23,9 +23,9 @@ define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
 
 define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                          double addrspace(1)* %in2, double addrspace(1)* %in3) {
-   %r0 = load double addrspace(1)* %in1
-   %r1 = load double addrspace(1)* %in2
-   %r2 = load double addrspace(1)* %in3
+   %r0 = load double, double addrspace(1)* %in1
+   %r1 = load double, double addrspace(1)* %in2
+   %r2 = load double, double addrspace(1)* %in3
    %r3 = tail call double @llvm.fmuladd.f64(double %r0, double %r1, double %r2)
    store double %r3, double addrspace(1)* %out
    ret void
@@ -38,12 +38,12 @@ define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 ; CHECK: buffer_store_dword [[RESULT]]
 define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
 
-  %r1 = load float addrspace(1)* %gep.0
-  %r2 = load float addrspace(1)* %gep.1
+  %r1 = load float, float addrspace(1)* %gep.0
+  %r2 = load float, float addrspace(1)* %gep.1
 
   %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2)
   store float %r3, float addrspace(1)* %gep.out
@@ -57,12 +57,12 @@ define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %
 ; CHECK: buffer_store_dword [[RESULT]]
 define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
 
-  %r1 = load float addrspace(1)* %gep.0
-  %r2 = load float addrspace(1)* %gep.1
+  %r1 = load float, float addrspace(1)* %gep.0
+  %r2 = load float, float addrspace(1)* %gep.1
 
   %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2)
   store float %r3, float addrspace(1)* %gep.out
@@ -78,12 +78,12 @@ define void @fadd_a_a_b_f32(float addrspace(1)* %out,
                             float addrspace(1)* %in1,
                             float addrspace(1)* %in2) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
 
-  %r0 = load float addrspace(1)* %gep.0
-  %r1 = load float addrspace(1)* %gep.1
+  %r0 = load float, float addrspace(1)* %gep.0
+  %r1 = load float, float addrspace(1)* %gep.1
 
   %add.0 = fadd float %r0, %r0
   %add.1 = fadd float %add.0, %r1
@@ -100,12 +100,12 @@ define void @fadd_b_a_a_f32(float addrspace(1)* %out,
                             float addrspace(1)* %in1,
                             float addrspace(1)* %in2) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
 
-  %r0 = load float addrspace(1)* %gep.0
-  %r1 = load float addrspace(1)* %gep.1
+  %r0 = load float, float addrspace(1)* %gep.0
+  %r1 = load float, float addrspace(1)* %gep.1
 
   %add.0 = fadd float %r0, %r0
   %add.1 = fadd float %r1, %add.0
@@ -120,12 +120,12 @@ define void @fadd_b_a_a_f32(float addrspace(1)* %out,
 ; CHECK: buffer_store_dword [[RESULT]]
 define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
 
-  %r1 = load float addrspace(1)* %gep.0
-  %r2 = load float addrspace(1)* %gep.1
+  %r1 = load float, float addrspace(1)* %gep.0
+  %r2 = load float, float addrspace(1)* %gep.1
 
   %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2)
   store float %r3, float addrspace(1)* %gep.out
@@ -140,12 +140,12 @@ define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1
 ; CHECK: buffer_store_dword [[RESULT]]
 define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
 
-  %r1 = load float addrspace(1)* %gep.0
-  %r2 = load float addrspace(1)* %gep.1
+  %r1 = load float, float addrspace(1)* %gep.0
+  %r2 = load float, float addrspace(1)* %gep.1
 
   %r1.fneg = fsub float -0.000000e+00, %r1
 
@@ -162,12 +162,12 @@ define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspa
 ; CHECK: buffer_store_dword [[RESULT]]
 define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
 
-  %r1 = load float addrspace(1)* %gep.0
-  %r2 = load float addrspace(1)* %gep.1
+  %r1 = load float, float addrspace(1)* %gep.0
+  %r2 = load float, float addrspace(1)* %gep.1
 
   %r1.fneg = fsub float -0.000000e+00, %r1
 
@@ -184,12 +184,12 @@ define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1
 ; CHECK: buffer_store_dword [[RESULT]]
 define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
 
-  %r1 = load float addrspace(1)* %gep.0
-  %r2 = load float addrspace(1)* %gep.1
+  %r1 = load float, float addrspace(1)* %gep.0
+  %r2 = load float, float addrspace(1)* %gep.1
 
   %r2.fneg = fsub float -0.000000e+00, %r2
 
diff --git a/test/CodeGen/R600/fneg-fabs.f64.ll b/test/CodeGen/R600/fneg-fabs.f64.ll
index 7430e7ffb33d..8830e8273661 100644
--- a/test/CodeGen/R600/fneg-fabs.f64.ll
+++ b/test/CodeGen/R600/fneg-fabs.f64.ll
@@ -5,9 +5,7 @@
 ; into 2 modifiers, although theoretically that should work.
 
 ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f64:
-; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x7fffffff
-; SI: v_and_b32_e32 v[[FABS:[0-9]+]], {{s[0-9]+}}, [[IMMREG]]
-; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+}}:[[FABS]]{{\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|
 define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) {
   %fabs = call double @llvm.fabs.f64(double %x)
   %fsub = fsub double -0.000000e+00, %fabs
@@ -17,8 +15,8 @@ define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y)
 }
 
 define void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %xptr, double addrspace(1)* %yptr) {
-  %x = load double addrspace(1)* %xptr, align 8
-  %y = load double addrspace(1)* %xptr, align 8
+  %x = load double, double addrspace(1)* %xptr, align 8
+  %y = load double, double addrspace(1)* %xptr, align 8
   %fabs = call double @llvm.fabs.f64(double %x)
   %fsub = fsub double -0.000000e+00, %fabs
   %fadd = fadd double %y, %fsub
@@ -57,8 +55,8 @@ define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
 }
 
 ; FUNC-LABEL: {{^}}fneg_fabs_f64:
-; SI: s_load_dwordx2
 ; SI: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}
+; SI: s_load_dwordx2
 ; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
 ; SI-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
 ; SI-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
diff --git a/test/CodeGen/R600/fneg-fabs.ll b/test/CodeGen/R600/fneg-fabs.ll
index 4fde0484567c..3b4930d9897d 100644
--- a/test/CodeGen/R600/fneg-fabs.ll
+++ b/test/CodeGen/R600/fneg-fabs.ll
@@ -72,7 +72,7 @@ define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) {
 ; FUNC-LABEL: {{^}}v_fneg_fabs_f32:
 ; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
 define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
-  %val = load float addrspace(1)* %in, align 4
+  %val = load float, float addrspace(1)* %in, align 4
   %fabs = call float @llvm.fabs.f32(float %val)
   %fsub = fsub float -0.000000e+00, %fabs
   store float %fsub, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/R600/fneg.f64.ll b/test/CodeGen/R600/fneg.f64.ll
index eb2eb08b88b1..aa6df209035b 100644
--- a/test/CodeGen/R600/fneg.f64.ll
+++ b/test/CodeGen/R600/fneg.f64.ll
@@ -1,7 +1,8 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}fneg_f64:
-; SI: v_xor_b32
+; GCN: v_xor_b32
 define void @fneg_f64(double addrspace(1)* %out, double %in) {
   %fneg = fsub double -0.000000e+00, %in
   store double %fneg, double addrspace(1)* %out
@@ -9,8 +10,8 @@ define void @fneg_f64(double addrspace(1)* %out, double %in) {
 }
 
 ; FUNC-LABEL: {{^}}fneg_v2f64:
-; SI: v_xor_b32
-; SI: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
 define void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) {
   %fneg = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %in
   store <2 x double> %fneg, <2 x double> addrspace(1)* %out
@@ -23,10 +24,10 @@ define void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double>
 ; R600: -PV
 ; R600: -PV
 
-; SI: v_xor_b32
-; SI: v_xor_b32
-; SI: v_xor_b32
-; SI: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
 define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) {
   %fneg = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %in
   store <4 x double> %fneg, <4 x double> addrspace(1)* %out
@@ -38,8 +39,7 @@ define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double>
 ; unless the target returns true for isNegFree()
 
 ; FUNC-LABEL: {{^}}fneg_free_f64:
-; FIXME: Unnecessary copy to VGPRs
-; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]$}}
+; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, 0, -{{s\[[0-9]+:[0-9]+\]$}}
 define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) {
   %bc = bitcast i64 %in to double
   %fsub = fsub double 0.0, %bc
@@ -47,10 +47,11 @@ define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) {
   ret void
 }
 
-; SI-LABEL: {{^}}fneg_fold_f64:
+; GCN-LABEL: {{^}}fneg_fold_f64:
 ; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-NOT: xor
-; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]]
+; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN-NOT: xor
+; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]]
 define void @fneg_fold_f64(double addrspace(1)* %out, double %in) {
   %fsub = fsub double -0.0, %in
   %fmul = fmul double %fsub, %in
diff --git a/test/CodeGen/R600/fneg.ll b/test/CodeGen/R600/fneg.ll
index ca3350dd7f48..a0fd539863c6 100644
--- a/test/CodeGen/R600/fneg.ll
+++ b/test/CodeGen/R600/fneg.ll
@@ -1,10 +1,11 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}fneg_f32:
 ; R600: -PV
 
-; SI: v_xor_b32
+; GCN: v_xor_b32
 define void @fneg_f32(float addrspace(1)* %out, float %in) {
   %fneg = fsub float -0.000000e+00, %in
   store float %fneg, float addrspace(1)* %out
@@ -15,8 +16,8 @@ define void @fneg_f32(float addrspace(1)* %out, float %in) {
 ; R600: -PV
 ; R600: -PV
 
-; SI: v_xor_b32
-; SI: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
 define void @fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
   %fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
   store <2 x float> %fneg, <2 x float> addrspace(1)* %out
@@ -29,10 +30,10 @@ define void @fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %i
 ; R600: -PV
 ; R600: -PV
 
-; SI: v_xor_b32
-; SI: v_xor_b32
-; SI: v_xor_b32
-; SI: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
 define void @fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
   %fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
   store <4 x float> %fneg, <4 x float> addrspace(1)* %out
@@ -48,7 +49,7 @@ define void @fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %i
 ; R600: -KC0[2].Z
 
 ; XXX: We could use v_add_f32_e64 with the negate bit here instead.
-; SI: v_sub_f32_e64 v{{[0-9]}}, 0, s{{[0-9]+$}}
+; GCN: v_sub_f32_e64 v{{[0-9]}}, 0, s{{[0-9]+$}}
 define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) {
   %bc = bitcast i32 %in to float
   %fsub = fsub float 0.0, %bc
@@ -58,8 +59,9 @@ define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) {
 
 ; FUNC-LABEL: {{^}}fneg_fold_f32:
 ; SI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
-; SI-NOT: xor
-; SI: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]]
+; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
+; GCN-NOT: xor
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]]
 define void @fneg_fold_f32(float addrspace(1)* %out, float %in) {
   %fsub = fsub float -0.0, %in
   %fmul = fmul float %fsub, %in
diff --git a/test/CodeGen/R600/fp-classify.ll b/test/CodeGen/R600/fp-classify.ll
index c1de85203104..4fac5176fac9 100644
--- a/test/CodeGen/R600/fp-classify.ll
+++ b/test/CodeGen/R600/fp-classify.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=r600 -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare i1 @llvm.AMDGPU.class.f32(float, i32) #1
 declare i1 @llvm.AMDGPU.class.f64(double, i32) #1
diff --git a/test/CodeGen/R600/fp16_to_fp.ll b/test/CodeGen/R600/fp16_to_fp.ll
index da78f6155c85..5a79ca82bc29 100644
--- a/test/CodeGen/R600/fp16_to_fp.ll
+++ b/test/CodeGen/R600/fp16_to_fp.ll
@@ -9,7 +9,7 @@ declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
 ; SI: v_cvt_f32_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; SI: buffer_store_dword [[RESULT]]
 define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
-  %val = load i16 addrspace(1)* %in, align 2
+  %val = load i16, i16 addrspace(1)* %in, align 2
   %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone
   store float %cvt, float addrspace(1)* %out, align 4
   ret void
@@ -22,7 +22,7 @@ define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 add
 ; SI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
 define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
-  %val = load i16 addrspace(1)* %in, align 2
+  %val = load i16, i16 addrspace(1)* %in, align 2
   %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone
   store double %cvt, double addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/R600/fp32_to_fp16.ll b/test/CodeGen/R600/fp32_to_fp16.ll
index c3c65aece082..67925ebd82b6 100644
--- a/test/CodeGen/R600/fp32_to_fp16.ll
+++ b/test/CodeGen/R600/fp32_to_fp16.ll
@@ -8,7 +8,7 @@ declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
 ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; SI: buffer_store_short [[RESULT]]
 define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
-  %val = load float addrspace(1)* %in, align 4
+  %val = load float, float addrspace(1)* %in, align 4
   %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone
   store i16 %cvt, i16 addrspace(1)* %out, align 2
   ret void
diff --git a/test/CodeGen/R600/fp_to_sint.f64.ll b/test/CodeGen/R600/fp_to_sint.f64.ll
index e6418477a9b4..12df6606e8ff 100644
--- a/test/CodeGen/R600/fp_to_sint.f64.ll
+++ b/test/CodeGen/R600/fp_to_sint.f64.ll
@@ -48,8 +48,8 @@ define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %
 ; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
 define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep = getelementptr double addrspace(1)* %in, i32 %tid
-  %val = load double addrspace(1)* %gep, align 8
+  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %val = load double, double addrspace(1)* %gep, align 8
   %cast = fptosi double %val to i64
   store i64 %cast, i64 addrspace(1)* %out, align 8
   ret void
diff --git a/test/CodeGen/R600/fp_to_sint.ll b/test/CodeGen/R600/fp_to_sint.ll
index 16549c392b00..301a94b4904c 100644
--- a/test/CodeGen/R600/fp_to_sint.ll
+++ b/test/CodeGen/R600/fp_to_sint.ll
@@ -44,7 +44,7 @@ define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
 ; SI: v_cvt_i32_f32_e32
 ; SI: v_cvt_i32_f32_e32
 define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
-  %value = load <4 x float> addrspace(1) * %in
+  %value = load <4 x float>, <4 x float> addrspace(1) * %in
   %result = fptosi <4 x float> %value to <4 x i32>
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/fp_to_uint.f64.ll b/test/CodeGen/R600/fp_to_uint.f64.ll
index 1ffe2faadf33..41bc2a780014 100644
--- a/test/CodeGen/R600/fp_to_uint.f64.ll
+++ b/test/CodeGen/R600/fp_to_uint.f64.ll
@@ -48,8 +48,8 @@ define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %
 ; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
 define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep = getelementptr double addrspace(1)* %in, i32 %tid
-  %val = load double addrspace(1)* %gep, align 8
+  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %val = load double, double addrspace(1)* %gep, align 8
   %cast = fptoui double %val to i64
   store i64 %cast, i64 addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/R600/fp_to_uint.ll b/test/CodeGen/R600/fp_to_uint.ll
index 804d90f476da..b7b6ccc238b3 100644
--- a/test/CodeGen/R600/fp_to_uint.ll
+++ b/test/CodeGen/R600/fp_to_uint.ll
@@ -36,7 +36,7 @@ define void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float>
 ; SI: v_cvt_u32_f32_e32
 
 define void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
-  %value = load <4 x float> addrspace(1) * %in
+  %value = load <4 x float>, <4 x float> addrspace(1) * %in
   %result = fptoui <4 x float> %value to <4 x i32>
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/fpext.ll b/test/CodeGen/R600/fpext.ll
index 21c7bfd48df8..734a43be2296 100644
--- a/test/CodeGen/R600/fpext.ll
+++ b/test/CodeGen/R600/fpext.ll
@@ -1,10 +1,45 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; CHECK: {{^}}fpext:
-; CHECK: v_cvt_f64_f32_e32
-define void @fpext(double addrspace(1)* %out, float %in) {
+; FUNC-LABEL: {{^}}fpext_f32_to_f64:
+; SI: v_cvt_f64_f32_e32 {{v\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+define void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) {
   %result = fpext float %in to double
   store double %result, double addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}fpext_v2f32_to_v2f64:
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+define void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %in) {
+  %result = fpext <2 x float> %in to <2 x double>
+  store <2 x double> %result, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fpext_v4f32_to_v4f64:
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+define void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %in) {
+  %result = fpext <4 x float> %in to <4 x double>
+  store <4 x double> %result, <4 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fpext_v8f32_to_v8f64:
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+define void @fpext_v8f32_to_v8f64(<8 x double> addrspace(1)* %out, <8 x float> %in) {
+  %result = fpext <8 x float> %in to <8 x double>
+  store <8 x double> %result, <8 x double> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fptrunc.ll b/test/CodeGen/R600/fptrunc.ll
index 94fcdab9c52f..385e10e7baae 100644
--- a/test/CodeGen/R600/fptrunc.ll
+++ b/test/CodeGen/R600/fptrunc.ll
@@ -1,10 +1,45 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; CHECK: {{^}}fptrunc:
-; CHECK: v_cvt_f32_f64_e32
-define void @fptrunc(float addrspace(1)* %out, double %in) {
+; FUNC-LABEL: {{^}}fptrunc_f64_to_f32:
+; SI: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) {
   %result = fptrunc double %in to float
   store float %result, float addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}fptrunc_v2f64_to_v2f32:
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) {
+  %result = fptrunc <2 x double> %in to <2 x float>
+  store <2 x float> %result, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fptrunc_v4f64_to_v4f32:
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) {
+  %result = fptrunc <4 x double> %in to <4 x float>
+  store <4 x float> %result, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fptrunc_v8f64_to_v8f32:
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+define void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) {
+  %result = fptrunc <8 x double> %in to <8 x float>
+  store <8 x float> %result, <8 x float> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/frem.ll b/test/CodeGen/R600/frem.ll
index 564634178656..f245ef08cb9d 100644
--- a/test/CodeGen/R600/frem.ll
+++ b/test/CodeGen/R600/frem.ll
@@ -1,73 +1,78 @@
-; RUN: llc -march=amdgcn -mcpu=SI -enable-misched < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -enable-misched < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}frem_f32:
-; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}}
-; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16
-; SI-DAG: v_cmp
-; SI-DAG: v_mul_f32
-; SI: v_rcp_f32_e32
-; SI: v_mul_f32_e32
-; SI: v_mul_f32_e32
-; SI: v_trunc_f32_e32
-; SI: v_mad_f32
-; SI: s_endpgm
+; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}}
+; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16
+; GCN-DAG: v_cmp
+; GCN-DAG: v_mul_f32
+; GCN: v_rcp_f32_e32
+; GCN: v_mul_f32_e32
+; GCN: v_mul_f32_e32
+; GCN: v_trunc_f32_e32
+; GCN: v_mad_f32
+; GCN: s_endpgm
 define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                       float addrspace(1)* %in2) #0 {
-   %gep2 = getelementptr float addrspace(1)* %in2, i32 4
-   %r0 = load float addrspace(1)* %in1, align 4
-   %r1 = load float addrspace(1)* %gep2, align 4
+   %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
+   %r0 = load float, float addrspace(1)* %in1, align 4
+   %r1 = load float, float addrspace(1)* %gep2, align 4
    %r2 = frem float %r0, %r1
    store float %r2, float addrspace(1)* %out, align 4
    ret void
 }
 
 ; FUNC-LABEL: {{^}}unsafe_frem_f32:
-; SI: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16
-; SI: buffer_load_dword [[X:v[0-9]+]], {{.*}}
-; SI: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]]
-; SI: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]]
-; SI: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]]
-; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]]
-; SI: buffer_store_dword [[RESULT]]
-; SI: s_endpgm
+; GCN: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16
+; GCN: buffer_load_dword [[X:v[0-9]+]], {{.*}}
+; GCN: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]]
+; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]]
+; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]]
+; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]]
+; GCN: buffer_store_dword [[RESULT]]
+; GCN: s_endpgm
 define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                              float addrspace(1)* %in2) #1 {
-   %gep2 = getelementptr float addrspace(1)* %in2, i32 4
-   %r0 = load float addrspace(1)* %in1, align 4
-   %r1 = load float addrspace(1)* %gep2, align 4
+   %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
+   %r0 = load float, float addrspace(1)* %in1, align 4
+   %r1 = load float, float addrspace(1)* %gep2, align 4
    %r2 = frem float %r0, %r1
    store float %r2, float addrspace(1)* %out, align 4
    ret void
 }
 
 ; FUNC-LABEL: {{^}}frem_f64:
-; SI: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0
-; SI: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0
-; SI-DAG: v_div_fmas_f64
-; SI-DAG: v_div_scale_f64
-; SI-DAG: v_mul_f64
-; SI: v_add_f64
-; SI: buffer_store_dwordx2
-; SI: s_endpgm
+; GCN: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0
+; GCN: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0
+; GCN-DAG: v_div_fmas_f64
+; GCN-DAG: v_div_scale_f64
+; GCN-DAG: v_mul_f64
+; CI: v_trunc_f64_e32
+; CI: v_mul_f64
+; GCN: v_add_f64
+; GCN: buffer_store_dwordx2
+; GCN: s_endpgm
 define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) #0 {
-   %r0 = load double addrspace(1)* %in1, align 8
-   %r1 = load double addrspace(1)* %in2, align 8
+   %r0 = load double, double addrspace(1)* %in1, align 8
+   %r1 = load double, double addrspace(1)* %in2, align 8
    %r2 = frem double %r0, %r1
    store double %r2, double addrspace(1)* %out, align 8
    ret void
 }
 
 ; FUNC-LABEL: {{^}}unsafe_frem_f64:
-; SI: v_rcp_f64_e32
-; SI: v_mul_f64
+; GCN: v_rcp_f64_e32
+; GCN: v_mul_f64
 ; SI: v_bfe_u32
-; SI: v_fma_f64
-; SI: s_endpgm
+; CI: v_trunc_f64_e32
+; GCN: v_fma_f64
+; GCN: s_endpgm
 define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                              double addrspace(1)* %in2) #1 {
-   %r0 = load double addrspace(1)* %in1, align 8
-   %r1 = load double addrspace(1)* %in2, align 8
+   %r0 = load double, double addrspace(1)* %in1, align 8
+   %r1 = load double, double addrspace(1)* %in2, align 8
    %r2 = frem double %r0, %r1
    store double %r2, double addrspace(1)* %out, align 8
    ret void
@@ -75,9 +80,9 @@ define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in
 
 define void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
                         <2 x float> addrspace(1)* %in2) #0 {
-   %gep2 = getelementptr <2 x float> addrspace(1)* %in2, i32 4
-   %r0 = load <2 x float> addrspace(1)* %in1, align 8
-   %r1 = load <2 x float> addrspace(1)* %gep2, align 8
+   %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4
+   %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8
+   %r1 = load <2 x float>, <2 x float> addrspace(1)* %gep2, align 8
    %r2 = frem <2 x float> %r0, %r1
    store <2 x float> %r2, <2 x float> addrspace(1)* %out, align 8
    ret void
@@ -85,9 +90,9 @@ define void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)
 
 define void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
                         <4 x float> addrspace(1)* %in2) #0 {
-   %gep2 = getelementptr <4 x float> addrspace(1)* %in2, i32 4
-   %r0 = load <4 x float> addrspace(1)* %in1, align 16
-   %r1 = load <4 x float> addrspace(1)* %gep2, align 16
+   %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4
+   %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16
+   %r1 = load <4 x float>, <4 x float> addrspace(1)* %gep2, align 16
    %r2 = frem <4 x float> %r0, %r1
    store <4 x float> %r2, <4 x float> addrspace(1)* %out, align 16
    ret void
@@ -95,9 +100,9 @@ define void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)
 
 define void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
                         <2 x double> addrspace(1)* %in2) #0 {
-   %gep2 = getelementptr <2 x double> addrspace(1)* %in2, i32 4
-   %r0 = load <2 x double> addrspace(1)* %in1, align 16
-   %r1 = load <2 x double> addrspace(1)* %gep2, align 16
+   %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4
+   %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16
+   %r1 = load <2 x double>, <2 x double> addrspace(1)* %gep2, align 16
    %r2 = frem <2 x double> %r0, %r1
    store <2 x double> %r2, <2 x double> addrspace(1)* %out, align 16
    ret void
diff --git a/test/CodeGen/R600/fsqrt.ll b/test/CodeGen/R600/fsqrt.ll
index 1fdf3e453bf3..04101346cdf9 100644
--- a/test/CodeGen/R600/fsqrt.ll
+++ b/test/CodeGen/R600/fsqrt.ll
@@ -9,7 +9,7 @@
 ; CHECK: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}}
 
 define void @fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
-   %r0 = load float addrspace(1)* %in
+   %r0 = load float, float addrspace(1)* %in
    %r1 = call float @llvm.sqrt.f32(float %r0)
    store float %r1, float addrspace(1)* %out
    ret void
@@ -19,7 +19,7 @@ define void @fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
 ; CHECK: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 
 define void @fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
-   %r0 = load double addrspace(1)* %in
+   %r0 = load double, double addrspace(1)* %in
    %r1 = call double @llvm.sqrt.f64(double %r0)
    store double %r1, double addrspace(1)* %out
    ret void
diff --git a/test/CodeGen/R600/fsub.ll b/test/CodeGen/R600/fsub.ll
index ef90fea67900..dfe41cb5b111 100644
--- a/test/CodeGen/R600/fsub.ll
+++ b/test/CodeGen/R600/fsub.ll
@@ -6,9 +6,9 @@
 ; FUNC-LABEL: {{^}}v_fsub_f32:
 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 define void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
-  %b_ptr = getelementptr float addrspace(1)* %in, i32 1
-  %a = load float addrspace(1)* %in, align 4
-  %b = load float addrspace(1)* %b_ptr, align 4
+  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+  %a = load float, float addrspace(1)* %in, align 4
+  %b = load float, float addrspace(1)* %b_ptr, align 4
   %result = fsub float %a, %b
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -52,9 +52,9 @@ define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x flo
 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
-  %a = load <4 x float> addrspace(1)* %in, align 16
-  %b = load <4 x float> addrspace(1)* %b_ptr, align 16
+  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
+  %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16
+  %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16
   %result = fsub <4 x float> %a, %b
   store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
   ret void
diff --git a/test/CodeGen/R600/fsub64.ll b/test/CodeGen/R600/fsub64.ll
index 62f46142fe0d..f34a48e30a86 100644
--- a/test/CodeGen/R600/fsub64.ll
+++ b/test/CodeGen/R600/fsub64.ll
@@ -1,13 +1,107 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
+declare double @llvm.fabs.f64(double) #0
+
 ; SI-LABEL: {{^}}fsub_f64:
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
 define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
-   %r0 = load double addrspace(1)* %in1
-   %r1 = load double addrspace(1)* %in2
-   %r2 = fsub double %r0, %r1
-   store double %r2, double addrspace(1)* %out
-   ret void
+  %r0 = load double, double addrspace(1)* %in1
+  %r1 = load double, double addrspace(1)* %in2
+  %r2 = fsub double %r0, %r1
+  store double %r2, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}fsub_fabs_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}}
+define void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                           double addrspace(1)* %in2) {
+  %r0 = load double, double addrspace(1)* %in1
+  %r1 = load double, double addrspace(1)* %in2
+  %r1.fabs = call double @llvm.fabs.f64(double %r1) #0
+  %r2 = fsub double %r0, %r1.fabs
+  store double %r2, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}fsub_fabs_inv_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, -v\[[0-9]+:[0-9]+\]}}
+define void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                               double addrspace(1)* %in2) {
+  %r0 = load double, double addrspace(1)* %in1
+  %r1 = load double, double addrspace(1)* %in2
+  %r0.fabs = call double @llvm.fabs.f64(double %r0) #0
+  %r2 = fsub double %r0.fabs, %r1
+  store double %r2, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_fsub_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+define void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) {
+  %sub = fsub double %a, %b
+  store double %sub, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_fsub_imm_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], 4.0, -s\[[0-9]+:[0-9]+\]}}
+define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) {
+  %sub = fsub double 4.0, %a
+  store double %sub, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_fsub_imm_inv_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -4.0, s\[[0-9]+:[0-9]+\]}}
+define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) {
+  %sub = fsub double %a, 4.0
+  store double %sub, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_fsub_self_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}}
+define void @s_fsub_self_f64(double addrspace(1)* %out, double %a) {
+  %sub = fsub double %a, %a
+  store double %sub, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}fsub_v2f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) {
+  %sub = fsub <2 x double> %a, %b
+  store <2 x double> %sub, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}fsub_v4f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+define void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1
+  %a = load <4 x double>, <4 x double> addrspace(1)* %in
+  %b = load <4 x double>, <4 x double> addrspace(1)* %b_ptr
+  %result = fsub <4 x double> %a, %b
+  store <4 x double> %result, <4 x double> addrspace(1)* %out
+  ret void
 }
+
+; SI-LABEL: {{^}}s_fsub_v4f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+define void @s_fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) {
+  %result = fsub <4 x double> %a, %b
+  store <4 x double> %result, <4 x double> addrspace(1)* %out, align 16
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/R600/ftrunc.f64.ll b/test/CodeGen/R600/ftrunc.f64.ll
index 2c7217ef0561..6618d8b5e57e 100644
--- a/test/CodeGen/R600/ftrunc.f64.ll
+++ b/test/CodeGen/R600/ftrunc.f64.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 
 declare double @llvm.trunc.f64(double) nounwind readnone
 declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone
@@ -13,7 +14,7 @@ declare <16 x double> @llvm.trunc.v16f64(<16 x double>) nounwind readnone
 ; SI: v_bfe_u32 {{v[0-9]+}}, {{v[0-9]+}}, 20, 11
 ; SI: s_endpgm
 define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
-  %x = load double addrspace(1)* %in, align 8
+  %x = load double, double addrspace(1)* %in, align 8
   %y = call double @llvm.trunc.f64(double %x) nounwind readnone
   store double %y, double addrspace(1)* %out, align 8
   ret void
@@ -23,15 +24,15 @@ define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
 ; CI: v_trunc_f64_e32
 
 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
 ; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
 ; SI: s_lshr_b64
 ; SI: s_not_b64
 ; SI: s_and_b64
-; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI: cmp_lt_i32
+; SI: cmp_gt_i32
 ; SI: cndmask_b32
 ; SI: cndmask_b32
-; SI: cmp_gt_i32
+; SI: cmp_lt_i32
 ; SI: cndmask_b32
 ; SI: cndmask_b32
 ; SI: s_endpgm
diff --git a/test/CodeGen/R600/gep-address-space.ll b/test/CodeGen/R600/gep-address-space.ll
index 2d1892534dc5..471b0f6b13e7 100644
--- a/test/CodeGen/R600/gep-address-space.ll
+++ b/test/CodeGen/R600/gep-address-space.ll
@@ -1,11 +1,12 @@
 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=CHECK %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
 
 define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind {
 ; CHECK-LABEL: {{^}}use_gep_address_space:
 ; CHECK: v_mov_b32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}}
 ; CHECK: ds_write_b32 [[PTR]], v{{[0-9]+}} offset:64
-  %p = getelementptr [1024 x i32] addrspace(3)* %array, i16 0, i16 16
+  %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16
   store i32 99, i32 addrspace(3)* %p
   ret void
 }
@@ -17,7 +18,7 @@ define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %arra
 ; SI: s_or_b32
 ; CI: s_add_i32
 ; CHECK: ds_write_b32
-  %p = getelementptr [1024 x i32] addrspace(3)* %array, i16 0, i16 16384
+  %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16384
   store i32 99, i32 addrspace(3)* %p
   ret void
 }
@@ -28,7 +29,7 @@ define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind
 ; CHECK: s_add_i32
 ; CHECK: s_add_i32
 ; CHECK: s_add_i32
-  %p = getelementptr <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
+  %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
   %p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0
   %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1
   %p2 = extractelement <4 x i32 addrspace(3)*> %p, i32 2
@@ -44,7 +45,7 @@ define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind
 ; CHECK-LABEL: {{^}}gep_as_vector_v2:
 ; CHECK: s_add_i32
 ; CHECK: s_add_i32
-  %p = getelementptr <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> <i16 16, i16 16>
+  %p = getelementptr [1024 x i32], <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> <i16 16, i16 16>
   %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0
   %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1
   store i32 99, i32 addrspace(3)* %p0
diff --git a/test/CodeGen/R600/global-directive.ll b/test/CodeGen/R600/global-directive.ll
index 3ba12c206ad3..be775cf9292f 100644
--- a/test/CodeGen/R600/global-directive.ll
+++ b/test/CodeGen/R600/global-directive.ll
@@ -6,9 +6,9 @@
 ; SI:	.globl	foo
 ; SI: {{^}}foo:
 define void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %a = load i32 addrspace(1)* %in
-  %b = load i32 addrspace(1)* %b_ptr
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %a = load i32, i32 addrspace(1)* %in
+  %b = load i32, i32 addrspace(1)* %b_ptr
   %result = add i32 %a, %b
   store i32 %result, i32 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/global-extload-i1.ll b/test/CodeGen/R600/global-extload-i1.ll
index 5dc494900ce8..bd9557d730fb 100644
--- a/test/CodeGen/R600/global-extload-i1.ll
+++ b/test/CodeGen/R600/global-extload-i1.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; FIXME: Evergreen broken
 
@@ -8,7 +8,7 @@
 ; SI: buffer_store_dword
 ; SI: s_endpgm
 define void @zextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %a = load i1 addrspace(1)* %in
+  %a = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
   ret void
@@ -20,7 +20,7 @@ define void @zextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
 ; SI: buffer_store_dword
 ; SI: s_endpgm
 define void @sextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %a = load i1 addrspace(1)* %in
+  %a = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
   ret void
@@ -29,7 +29,7 @@ define void @sextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
 ; FUNC-LABEL: {{^}}zextload_global_v1i1_to_v1i32:
 ; SI: s_endpgm
 define void @zextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind {
-  %load = load <1 x i1> addrspace(1)* %in
+  %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
   %ext = zext <1 x i1> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
   ret void
@@ -38,7 +38,7 @@ define void @zextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1
 ; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i32:
 ; SI: s_endpgm
 define void @sextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind {
-  %load = load <1 x i1> addrspace(1)* %in
+  %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
   %ext = sext <1 x i1> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
   ret void
@@ -47,7 +47,7 @@ define void @sextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1
 ; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i32:
 ; SI: s_endpgm
 define void @zextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind {
-  %load = load <2 x i1> addrspace(1)* %in
+  %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
   %ext = zext <2 x i1> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
   ret void
@@ -56,7 +56,7 @@ define void @zextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1
 ; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i32:
 ; SI: s_endpgm
 define void @sextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind {
-  %load = load <2 x i1> addrspace(1)* %in
+  %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
   %ext = sext <2 x i1> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
   ret void
@@ -65,7 +65,7 @@ define void @sextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1
 ; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i32:
 ; SI: s_endpgm
 define void @zextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind {
-  %load = load <4 x i1> addrspace(1)* %in
+  %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
   %ext = zext <4 x i1> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
   ret void
@@ -74,7 +74,7 @@ define void @zextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1
 ; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i32:
 ; SI: s_endpgm
 define void @sextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind {
-  %load = load <4 x i1> addrspace(1)* %in
+  %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
   %ext = sext <4 x i1> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
   ret void
@@ -83,7 +83,7 @@ define void @sextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1
 ; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i32:
 ; SI: s_endpgm
 define void @zextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind {
-  %load = load <8 x i1> addrspace(1)* %in
+  %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
   %ext = zext <8 x i1> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
   ret void
@@ -92,7 +92,7 @@ define void @zextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1
 ; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i32:
 ; SI: s_endpgm
 define void @sextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind {
-  %load = load <8 x i1> addrspace(1)* %in
+  %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
   %ext = sext <8 x i1> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
   ret void
@@ -101,7 +101,7 @@ define void @sextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1
 ; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i32:
 ; SI: s_endpgm
 define void @zextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind {
-  %load = load <16 x i1> addrspace(1)* %in
+  %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
   %ext = zext <16 x i1> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
   ret void
@@ -110,7 +110,7 @@ define void @zextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 ; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i32:
 ; SI: s_endpgm
 define void @sextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind {
-  %load = load <16 x i1> addrspace(1)* %in
+  %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
   %ext = sext <16 x i1> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
   ret void
@@ -119,7 +119,7 @@ define void @sextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 ; XFUNC-LABEL: {{^}}zextload_global_v32i1_to_v32i32:
 ; XSI: s_endpgm
 ; define void @zextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind {
-;   %load = load <32 x i1> addrspace(1)* %in
+;   %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
 ;   %ext = zext <32 x i1> %load to <32 x i32>
 ;   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
 ;   ret void
@@ -128,7 +128,7 @@ define void @sextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 ; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i32:
 ; XSI: s_endpgm
 ; define void @sextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind {
-;   %load = load <32 x i1> addrspace(1)* %in
+;   %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
 ;   %ext = sext <32 x i1> %load to <32 x i32>
 ;   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
 ;   ret void
@@ -137,7 +137,7 @@ define void @sextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 ; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i32:
 ; XSI: s_endpgm
 ; define void @zextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind {
-;   %load = load <64 x i1> addrspace(1)* %in
+;   %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
 ;   %ext = zext <64 x i1> %load to <64 x i32>
 ;   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
 ;   ret void
@@ -146,7 +146,7 @@ define void @sextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 ; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i32:
 ; XSI: s_endpgm
 ; define void @sextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind {
-;   %load = load <64 x i1> addrspace(1)* %in
+;   %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
 ;   %ext = sext <64 x i1> %load to <64 x i32>
 ;   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
 ;   ret void
@@ -157,7 +157,7 @@ define void @sextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 ; SI: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
 ; SI: buffer_store_dwordx2
 define void @zextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %a = load i1 addrspace(1)* %in
+  %a = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
   ret void
@@ -169,7 +169,7 @@ define void @zextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
 ; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
 ; SI: buffer_store_dwordx2
 define void @sextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %a = load i1 addrspace(1)* %in
+  %a = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
   ret void
@@ -178,7 +178,7 @@ define void @sextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
 ; FUNC-LABEL: {{^}}zextload_global_v1i1_to_v1i64:
 ; SI: s_endpgm
 define void @zextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind {
-  %load = load <1 x i1> addrspace(1)* %in
+  %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
   %ext = zext <1 x i1> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
   ret void
@@ -187,7 +187,7 @@ define void @zextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1
 ; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i64:
 ; SI: s_endpgm
 define void @sextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind {
-  %load = load <1 x i1> addrspace(1)* %in
+  %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
   %ext = sext <1 x i1> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
   ret void
@@ -196,7 +196,7 @@ define void @sextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1
 ; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i64:
 ; SI: s_endpgm
 define void @zextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind {
-  %load = load <2 x i1> addrspace(1)* %in
+  %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
   %ext = zext <2 x i1> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
   ret void
@@ -205,7 +205,7 @@ define void @zextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1
 ; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i64:
 ; SI: s_endpgm
 define void @sextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind {
-  %load = load <2 x i1> addrspace(1)* %in
+  %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
   %ext = sext <2 x i1> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
   ret void
@@ -214,7 +214,7 @@ define void @sextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1
 ; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i64:
 ; SI: s_endpgm
 define void @zextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind {
-  %load = load <4 x i1> addrspace(1)* %in
+  %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
   %ext = zext <4 x i1> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
   ret void
@@ -223,7 +223,7 @@ define void @zextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1
 ; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i64:
 ; SI: s_endpgm
 define void @sextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind {
-  %load = load <4 x i1> addrspace(1)* %in
+  %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
   %ext = sext <4 x i1> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
   ret void
@@ -232,7 +232,7 @@ define void @sextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1
 ; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i64:
 ; SI: s_endpgm
 define void @zextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind {
-  %load = load <8 x i1> addrspace(1)* %in
+  %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
   %ext = zext <8 x i1> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
   ret void
@@ -241,7 +241,7 @@ define void @zextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1
 ; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i64:
 ; SI: s_endpgm
 define void @sextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind {
-  %load = load <8 x i1> addrspace(1)* %in
+  %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
   %ext = sext <8 x i1> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
   ret void
@@ -250,7 +250,7 @@ define void @sextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1
 ; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i64:
 ; SI: s_endpgm
 define void @zextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind {
-  %load = load <16 x i1> addrspace(1)* %in
+  %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
   %ext = zext <16 x i1> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
   ret void
@@ -259,7 +259,7 @@ define void @zextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 ; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i64:
 ; SI: s_endpgm
 define void @sextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind {
-  %load = load <16 x i1> addrspace(1)* %in
+  %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
   %ext = sext <16 x i1> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
   ret void
@@ -268,7 +268,7 @@ define void @sextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 ; XFUNC-LABEL: {{^}}zextload_global_v32i1_to_v32i64:
 ; XSI: s_endpgm
 ; define void @zextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind {
-;   %load = load <32 x i1> addrspace(1)* %in
+;   %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
 ;   %ext = zext <32 x i1> %load to <32 x i64>
 ;   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
 ;   ret void
@@ -277,7 +277,7 @@ define void @sextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 ; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i64:
 ; XSI: s_endpgm
 ; define void @sextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind {
-;   %load = load <32 x i1> addrspace(1)* %in
+;   %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
 ;   %ext = sext <32 x i1> %load to <32 x i64>
 ;   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
 ;   ret void
@@ -286,7 +286,7 @@ define void @sextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 ; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i64:
 ; XSI: s_endpgm
 ; define void @zextload_global_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind {
-;   %load = load <64 x i1> addrspace(1)* %in
+;   %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
 ;   %ext = zext <64 x i1> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
 ;   ret void
@@ -295,7 +295,7 @@ define void @sextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 ; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i64:
 ; XSI: s_endpgm
 ; define void @sextload_global_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind {
-;   %load = load <64 x i1> addrspace(1)* %in
+;   %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
 ;   %ext = sext <64 x i1> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
 ;   ret void
diff --git a/test/CodeGen/R600/global-extload-i16.ll b/test/CodeGen/R600/global-extload-i16.ll
index a1740ec8236a..103a40dee270 100644
--- a/test/CodeGen/R600/global-extload-i16.ll
+++ b/test/CodeGen/R600/global-extload-i16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; FIXME: cypress is broken because the bigger testcases spill and it's not implemented
 
@@ -8,7 +8,7 @@
 ; SI: buffer_store_dword
 ; SI: s_endpgm
 define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
-  %a = load i16 addrspace(1)* %in
+  %a = load i16, i16 addrspace(1)* %in
   %ext = zext i16 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
   ret void
@@ -19,7 +19,7 @@ define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
 ; SI: buffer_store_dword
 ; SI: s_endpgm
 define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
-  %a = load i16 addrspace(1)* %in
+  %a = load i16, i16 addrspace(1)* %in
   %ext = sext i16 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
   ret void
@@ -29,7 +29,7 @@ define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
 ; SI: buffer_load_ushort
 ; SI: s_endpgm
 define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <1 x i16> addrspace(1)* %in
+  %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
   %ext = zext <1 x i16> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
   ret void
@@ -39,7 +39,7 @@ define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i
 ; SI: buffer_load_sshort
 ; SI: s_endpgm
 define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <1 x i16> addrspace(1)* %in
+  %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
   %ext = sext <1 x i16> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
   ret void
@@ -48,7 +48,7 @@ define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i
 ; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32:
 ; SI: s_endpgm
 define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <2 x i16> addrspace(1)* %in
+  %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %ext = zext <2 x i16> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
   ret void
@@ -57,7 +57,7 @@ define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i
 ; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32:
 ; SI: s_endpgm
 define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <2 x i16> addrspace(1)* %in
+  %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %ext = sext <2 x i16> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
   ret void
@@ -66,7 +66,7 @@ define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i
 ; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32:
 ; SI: s_endpgm
 define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <4 x i16> addrspace(1)* %in
+  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %ext = zext <4 x i16> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
   ret void
@@ -75,7 +75,7 @@ define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i
 ; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32:
 ; SI: s_endpgm
 define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <4 x i16> addrspace(1)* %in
+  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %ext = sext <4 x i16> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
   ret void
@@ -84,7 +84,7 @@ define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i
 ; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32:
 ; SI: s_endpgm
 define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <8 x i16> addrspace(1)* %in
+  %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
   %ext = zext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
   ret void
@@ -93,7 +93,7 @@ define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i
 ; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32:
 ; SI: s_endpgm
 define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <8 x i16> addrspace(1)* %in
+  %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
   %ext = sext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
   ret void
@@ -102,7 +102,7 @@ define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i
 ; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32:
 ; SI: s_endpgm
 define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <16 x i16> addrspace(1)* %in
+  %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
   %ext = zext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
   ret void
@@ -111,7 +111,7 @@ define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 ; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32:
 ; SI: s_endpgm
 define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <16 x i16> addrspace(1)* %in
+  %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
   %ext = sext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
   ret void
@@ -120,7 +120,7 @@ define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 ; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32:
 ; SI: s_endpgm
 define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <32 x i16> addrspace(1)* %in
+  %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
   %ext = zext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
   ret void
@@ -129,7 +129,7 @@ define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32
 ; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32:
 ; SI: s_endpgm
 define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <32 x i16> addrspace(1)* %in
+  %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
   %ext = sext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
   ret void
@@ -138,7 +138,7 @@ define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32
 ; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32:
 ; SI: s_endpgm
 define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <64 x i16> addrspace(1)* %in
+  %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
   %ext = zext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
   ret void
@@ -147,7 +147,7 @@ define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64
 ; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32:
 ; SI: s_endpgm
 define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <64 x i16> addrspace(1)* %in
+  %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
   %ext = sext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
   ret void
@@ -158,7 +158,7 @@ define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64
 ; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
 define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
-  %a = load i16 addrspace(1)* %in
+  %a = load i16, i16 addrspace(1)* %in
   %ext = zext i16 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
   ret void
@@ -169,7 +169,7 @@ define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
 ; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
 ; SI: buffer_store_dwordx2
 define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
-  %a = load i16 addrspace(1)* %in
+  %a = load i16, i16 addrspace(1)* %in
   %ext = sext i16 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
   ret void
@@ -178,7 +178,7 @@ define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
 ; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64:
 ; SI: s_endpgm
 define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <1 x i16> addrspace(1)* %in
+  %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
   %ext = zext <1 x i16> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
   ret void
@@ -187,7 +187,7 @@ define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i
 ; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64:
 ; SI: s_endpgm
 define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <1 x i16> addrspace(1)* %in
+  %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
   %ext = sext <1 x i16> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
   ret void
@@ -196,7 +196,7 @@ define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i
 ; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64:
 ; SI: s_endpgm
 define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <2 x i16> addrspace(1)* %in
+  %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %ext = zext <2 x i16> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
   ret void
@@ -205,7 +205,7 @@ define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
 ; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64:
 ; SI: s_endpgm
 define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <2 x i16> addrspace(1)* %in
+  %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %ext = sext <2 x i16> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
   ret void
@@ -214,7 +214,7 @@ define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
 ; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i64:
 ; SI: s_endpgm
 define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <4 x i16> addrspace(1)* %in
+  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %ext = zext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
   ret void
@@ -223,7 +223,7 @@ define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
 ; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64:
 ; SI: s_endpgm
 define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <4 x i16> addrspace(1)* %in
+  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %ext = sext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
   ret void
@@ -232,7 +232,7 @@ define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
 ; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64:
 ; SI: s_endpgm
 define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <8 x i16> addrspace(1)* %in
+  %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
   %ext = zext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
   ret void
@@ -241,7 +241,7 @@ define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
 ; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64:
 ; SI: s_endpgm
 define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <8 x i16> addrspace(1)* %in
+  %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
   %ext = sext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
   ret void
@@ -250,7 +250,7 @@ define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
 ; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64:
 ; SI: s_endpgm
 define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <16 x i16> addrspace(1)* %in
+  %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
   %ext = zext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
   ret void
@@ -259,7 +259,7 @@ define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 ; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64:
 ; SI: s_endpgm
 define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <16 x i16> addrspace(1)* %in
+  %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
   %ext = sext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
   ret void
@@ -268,7 +268,7 @@ define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 ; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64:
 ; SI: s_endpgm
 define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <32 x i16> addrspace(1)* %in
+  %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
   %ext = zext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
   ret void
@@ -277,7 +277,7 @@ define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32
 ; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64:
 ; SI: s_endpgm
 define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <32 x i16> addrspace(1)* %in
+  %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
   %ext = sext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
   ret void
@@ -286,7 +286,7 @@ define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32
 ; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64:
 ; SI: s_endpgm
 define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <64 x i16> addrspace(1)* %in
+  %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
   %ext = zext <64 x i16> %load to <64 x i64>
   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
   ret void
@@ -295,7 +295,7 @@ define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64
 ; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64:
 ; SI: s_endpgm
 define void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <64 x i16> addrspace(1)* %in
+  %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
   %ext = sext <64 x i16> %load to <64 x i64>
   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/global-extload-i32.ll b/test/CodeGen/R600/global-extload-i32.ll
index f56b6ac8dc38..79b83452939e 100644
--- a/test/CodeGen/R600/global-extload-i32.ll
+++ b/test/CodeGen/R600/global-extload-i32.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}zextload_global_i32_to_i64:
@@ -7,7 +7,7 @@
 ; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
 define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %a = load i32 addrspace(1)* %in
+  %a = load i32, i32 addrspace(1)* %in
   %ext = zext i32 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
   ret void
@@ -18,7 +18,7 @@ define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)
 ; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
 ; SI: buffer_store_dwordx2
 define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %a = load i32 addrspace(1)* %in
+  %a = load i32, i32 addrspace(1)* %in
   %ext = sext i32 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
   ret void
@@ -29,7 +29,7 @@ define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @zextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* nocapture %in) nounwind {
-  %load = load <1 x i32> addrspace(1)* %in
+  %load = load <1 x i32>, <1 x i32> addrspace(1)* %in
   %ext = zext <1 x i32> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
   ret void
@@ -41,7 +41,7 @@ define void @zextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @sextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* nocapture %in) nounwind {
-  %load = load <1 x i32> addrspace(1)* %in
+  %load = load <1 x i32>, <1 x i32> addrspace(1)* %in
   %ext = sext <1 x i32> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
   ret void
@@ -53,7 +53,7 @@ define void @sextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @zextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind {
-  %load = load <2 x i32> addrspace(1)* %in
+  %load = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %ext = zext <2 x i32> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
   ret void
@@ -67,7 +67,7 @@ define void @zextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
 ; SI-DAG: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @sextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind {
-  %load = load <2 x i32> addrspace(1)* %in
+  %load = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %ext = sext <2 x i32> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
   ret void
@@ -81,7 +81,7 @@ define void @sextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @zextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind {
-  %load = load <4 x i32> addrspace(1)* %in
+  %load = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %ext = zext <4 x i32> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
   ret void
@@ -99,7 +99,7 @@ define void @zextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
 ; SI-DAG: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind {
-  %load = load <4 x i32> addrspace(1)* %in
+  %load = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %ext = sext <4 x i32> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
   ret void
@@ -124,7 +124,7 @@ define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
 ; SI-DAG: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind {
-  %load = load <8 x i32> addrspace(1)* %in
+  %load = load <8 x i32>, <8 x i32> addrspace(1)* %in
   %ext = zext <8 x i32> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
   ret void
@@ -159,7 +159,7 @@ define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
 
 ; SI: s_endpgm
 define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind {
-  %load = load <8 x i32> addrspace(1)* %in
+  %load = load <8 x i32>, <8 x i32> addrspace(1)* %in
   %ext = sext <8 x i32> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
   ret void
@@ -212,7 +212,7 @@ define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
 ; SI-DAG: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind {
-  %load = load <16 x i32> addrspace(1)* %in
+  %load = load <16 x i32>, <16 x i32> addrspace(1)* %in
   %ext = sext <16 x i32> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
   ret void
@@ -255,7 +255,7 @@ define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 
 ; SI: s_endpgm
 define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind {
-  %load = load <16 x i32> addrspace(1)* %in
+  %load = load <16 x i32>, <16 x i32> addrspace(1)* %in
   %ext = zext <16 x i32> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
   ret void
@@ -369,7 +369,7 @@ define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 
 ; SI: s_endpgm
 define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind {
-  %load = load <32 x i32> addrspace(1)* %in
+  %load = load <32 x i32>, <32 x i32> addrspace(1)* %in
   %ext = sext <32 x i32> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
   ret void
@@ -450,7 +450,7 @@ define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32
 
 ; SI: s_endpgm
 define void @zextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind {
-  %load = load <32 x i32> addrspace(1)* %in
+  %load = load <32 x i32>, <32 x i32> addrspace(1)* %in
   %ext = zext <32 x i32> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/global-extload-i8.ll b/test/CodeGen/R600/global-extload-i8.ll
index 86245232d3e4..b31d5361d5a2 100644
--- a/test/CodeGen/R600/global-extload-i8.ll
+++ b/test/CodeGen/R600/global-extload-i8.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}zextload_global_i8_to_i32:
@@ -7,7 +7,7 @@
 ; SI: buffer_store_dword
 ; SI: s_endpgm
 define void @zextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
-  %a = load i8 addrspace(1)* %in
+  %a = load i8, i8 addrspace(1)* %in
   %ext = zext i8 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
   ret void
@@ -18,7 +18,7 @@ define void @zextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)*
 ; SI: buffer_store_dword
 ; SI: s_endpgm
 define void @sextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
-  %a = load i8 addrspace(1)* %in
+  %a = load i8, i8 addrspace(1)* %in
   %ext = sext i8 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
   ret void
@@ -27,7 +27,7 @@ define void @sextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)*
 ; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i32:
 ; SI: s_endpgm
 define void @zextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind {
-  %load = load <1 x i8> addrspace(1)* %in
+  %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
   %ext = zext <1 x i8> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
   ret void
@@ -36,7 +36,7 @@ define void @zextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8
 ; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i32:
 ; SI: s_endpgm
 define void @sextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind {
-  %load = load <1 x i8> addrspace(1)* %in
+  %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
   %ext = sext <1 x i8> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
   ret void
@@ -45,7 +45,7 @@ define void @sextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8
 ; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i32:
 ; SI: s_endpgm
 define void @zextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind {
-  %load = load <2 x i8> addrspace(1)* %in
+  %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
   %ext = zext <2 x i8> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
   ret void
@@ -54,7 +54,7 @@ define void @zextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8
 ; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i32:
 ; SI: s_endpgm
 define void @sextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind {
-  %load = load <2 x i8> addrspace(1)* %in
+  %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
   %ext = sext <2 x i8> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
   ret void
@@ -63,7 +63,7 @@ define void @sextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8
 ; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i32:
 ; SI: s_endpgm
 define void @zextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind {
-  %load = load <4 x i8> addrspace(1)* %in
+  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
   %ext = zext <4 x i8> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
   ret void
@@ -72,7 +72,7 @@ define void @zextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8
 ; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i32:
 ; SI: s_endpgm
 define void @sextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind {
-  %load = load <4 x i8> addrspace(1)* %in
+  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
   %ext = sext <4 x i8> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
   ret void
@@ -81,7 +81,7 @@ define void @sextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8
 ; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i32:
 ; SI: s_endpgm
 define void @zextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind {
-  %load = load <8 x i8> addrspace(1)* %in
+  %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
   %ext = zext <8 x i8> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
   ret void
@@ -90,7 +90,7 @@ define void @zextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8
 ; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i32:
 ; SI: s_endpgm
 define void @sextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind {
-  %load = load <8 x i8> addrspace(1)* %in
+  %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
   %ext = sext <8 x i8> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
   ret void
@@ -99,7 +99,7 @@ define void @sextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8
 ; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i32:
 ; SI: s_endpgm
 define void @zextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind {
-  %load = load <16 x i8> addrspace(1)* %in
+  %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
   %ext = zext <16 x i8> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
   ret void
@@ -108,7 +108,7 @@ define void @zextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 ; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i32:
 ; SI: s_endpgm
 define void @sextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind {
-  %load = load <16 x i8> addrspace(1)* %in
+  %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
   %ext = sext <16 x i8> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
   ret void
@@ -117,7 +117,7 @@ define void @sextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 ; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i32:
 ; XSI: s_endpgm
 ; define void @zextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind {
-;   %load = load <32 x i8> addrspace(1)* %in
+;   %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
 ;   %ext = zext <32 x i8> %load to <32 x i32>
 ;   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
 ;   ret void
@@ -126,7 +126,7 @@ define void @sextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 ; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i32:
 ; XSI: s_endpgm
 ; define void @sextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind {
-;   %load = load <32 x i8> addrspace(1)* %in
+;   %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
 ;   %ext = sext <32 x i8> %load to <32 x i32>
 ;   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
 ;   ret void
@@ -135,7 +135,7 @@ define void @sextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 ; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i32:
 ; XSI: s_endpgm
 ; define void @zextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind {
-;   %load = load <64 x i8> addrspace(1)* %in
+;   %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
 ;   %ext = zext <64 x i8> %load to <64 x i32>
 ;   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
 ;   ret void
@@ -144,7 +144,7 @@ define void @sextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 ; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i32:
 ; XSI: s_endpgm
 ; define void @sextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind {
-;   %load = load <64 x i8> addrspace(1)* %in
+;   %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
 ;   %ext = sext <64 x i8> %load to <64 x i32>
 ;   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
 ;   ret void
@@ -155,7 +155,7 @@ define void @sextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 ; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
 define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
-  %a = load i8 addrspace(1)* %in
+  %a = load i8, i8 addrspace(1)* %in
   %ext = zext i8 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
   ret void
@@ -166,7 +166,7 @@ define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)*
 ; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
 ; SI: buffer_store_dwordx2
 define void @sextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
-  %a = load i8 addrspace(1)* %in
+  %a = load i8, i8 addrspace(1)* %in
   %ext = sext i8 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
   ret void
@@ -175,7 +175,7 @@ define void @sextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)*
 ; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i64:
 ; SI: s_endpgm
 define void @zextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind {
-  %load = load <1 x i8> addrspace(1)* %in
+  %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
   %ext = zext <1 x i8> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
   ret void
@@ -184,7 +184,7 @@ define void @zextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8
 ; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i64:
 ; SI: s_endpgm
 define void @sextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind {
-  %load = load <1 x i8> addrspace(1)* %in
+  %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
   %ext = sext <1 x i8> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
   ret void
@@ -193,7 +193,7 @@ define void @sextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8
 ; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i64:
 ; SI: s_endpgm
 define void @zextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind {
-  %load = load <2 x i8> addrspace(1)* %in
+  %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
   %ext = zext <2 x i8> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
   ret void
@@ -202,7 +202,7 @@ define void @zextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8
 ; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i64:
 ; SI: s_endpgm
 define void @sextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind {
-  %load = load <2 x i8> addrspace(1)* %in
+  %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
   %ext = sext <2 x i8> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
   ret void
@@ -211,7 +211,7 @@ define void @sextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8
 ; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i64:
 ; SI: s_endpgm
 define void @zextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind {
-  %load = load <4 x i8> addrspace(1)* %in
+  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
   %ext = zext <4 x i8> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
   ret void
@@ -220,7 +220,7 @@ define void @zextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8
 ; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i64:
 ; SI: s_endpgm
 define void @sextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind {
-  %load = load <4 x i8> addrspace(1)* %in
+  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
   %ext = sext <4 x i8> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
   ret void
@@ -229,7 +229,7 @@ define void @sextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8
 ; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i64:
 ; SI: s_endpgm
 define void @zextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind {
-  %load = load <8 x i8> addrspace(1)* %in
+  %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
   %ext = zext <8 x i8> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
   ret void
@@ -238,7 +238,7 @@ define void @zextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8
 ; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i64:
 ; SI: s_endpgm
 define void @sextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind {
-  %load = load <8 x i8> addrspace(1)* %in
+  %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
   %ext = sext <8 x i8> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
   ret void
@@ -247,7 +247,7 @@ define void @sextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8
 ; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i64:
 ; SI: s_endpgm
 define void @zextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind {
-  %load = load <16 x i8> addrspace(1)* %in
+  %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
   %ext = zext <16 x i8> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
   ret void
@@ -256,7 +256,7 @@ define void @zextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 ; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i64:
 ; SI: s_endpgm
 define void @sextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind {
-  %load = load <16 x i8> addrspace(1)* %in
+  %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
   %ext = sext <16 x i8> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
   ret void
@@ -265,7 +265,7 @@ define void @sextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 ; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i64:
 ; XSI: s_endpgm
 ; define void @zextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind {
-;   %load = load <32 x i8> addrspace(1)* %in
+;   %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
 ;   %ext = zext <32 x i8> %load to <32 x i64>
 ;   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
 ;   ret void
@@ -274,7 +274,7 @@ define void @sextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 ; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i64:
 ; XSI: s_endpgm
 ; define void @sextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind {
-;   %load = load <32 x i8> addrspace(1)* %in
+;   %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
 ;   %ext = sext <32 x i8> %load to <32 x i64>
 ;   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
 ;   ret void
@@ -283,7 +283,7 @@ define void @sextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 ; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i64:
 ; XSI: s_endpgm
 ; define void @zextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind {
-;   %load = load <64 x i8> addrspace(1)* %in
+;   %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
 ;   %ext = zext <64 x i8> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
 ;   ret void
@@ -292,7 +292,7 @@ define void @sextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 ; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i64:
 ; XSI: s_endpgm
 ; define void @sextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind {
-;   %load = load <64 x i8> addrspace(1)* %in
+;   %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
 ;   %ext = sext <64 x i8> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
 ;   ret void
diff --git a/test/CodeGen/R600/global-zero-initializer.ll b/test/CodeGen/R600/global-zero-initializer.ll
index 6909c58354c5..45aa8bf4e1d7 100644
--- a/test/CodeGen/R600/global-zero-initializer.ll
+++ b/test/CodeGen/R600/global-zero-initializer.ll
@@ -6,8 +6,8 @@
 @lds = addrspace(1) global [256 x i32] zeroinitializer
 
 define void @load_init_global_global(i32 addrspace(1)* %out, i1 %p) {
- %gep = getelementptr [256 x i32] addrspace(1)* @lds, i32 0, i32 10
-  %ld = load i32 addrspace(1)* %gep
+ %gep = getelementptr [256 x i32], [256 x i32] addrspace(1)* @lds, i32 0, i32 10
+  %ld = load i32, i32 addrspace(1)* %gep
   store i32 %ld, i32 addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/R600/global_atomics.ll b/test/CodeGen/R600/global_atomics.ll
index 5a07a028f44f..847950f6376e 100644
--- a/test/CodeGen/R600/global_atomics.ll
+++ b/test/CodeGen/R600/global_atomics.ll
@@ -4,7 +4,7 @@
 ; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
-  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
   %0  = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
   ret void
 }
@@ -14,7 +14,7 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
-  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
   %0  = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -24,8 +24,8 @@ entry:
 ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %0  = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
   ret void
 }
@@ -35,8 +35,8 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %0  = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -64,7 +64,7 @@ entry:
 ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %0  = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
   ret void
 }
@@ -74,7 +74,7 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %0  = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -84,7 +84,7 @@ entry:
 ; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
-  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
   %0  = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
   ret void
 }
@@ -94,7 +94,7 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
-  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
   %0  = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -104,8 +104,8 @@ entry:
 ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %0  = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
   ret void
 }
@@ -115,8 +115,8 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %0  = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -144,7 +144,7 @@ entry:
 ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %0  = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
   ret void
 }
@@ -154,7 +154,7 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %0  = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -164,7 +164,7 @@ entry:
 ; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
-  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
   %0  = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
   ret void
 }
@@ -174,7 +174,7 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
-  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
   %0  = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -184,8 +184,8 @@ entry:
 ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %0  = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
   ret void
 }
@@ -195,8 +195,8 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %0  = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -224,7 +224,7 @@ entry:
 ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %0  = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
   ret void
 }
@@ -234,7 +234,7 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %0  = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -244,7 +244,7 @@ entry:
 ; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
-  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
   %0  = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
   ret void
 }
@@ -254,7 +254,7 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
-  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
   %0  = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -264,8 +264,8 @@ entry:
 ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %0  = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
   ret void
 }
@@ -275,8 +275,8 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %0  = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -304,7 +304,7 @@ entry:
 ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %0  = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
   ret void
 }
@@ -314,7 +314,7 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %0  = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -324,7 +324,7 @@ entry:
 ; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
-  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
   %0  = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
   ret void
 }
@@ -334,7 +334,7 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
-  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
   %0  = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -344,8 +344,8 @@ entry:
 ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %0  = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
   ret void
 }
@@ -355,8 +355,8 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %0  = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -384,7 +384,7 @@ entry:
 ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %0  = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
   ret void
 }
@@ -394,7 +394,7 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %0  = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -404,7 +404,7 @@ entry:
 ; SI: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
-  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
   %0  = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
   ret void
 }
@@ -414,7 +414,7 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
-  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
   %0  = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -424,8 +424,8 @@ entry:
 ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %0  = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
   ret void
 }
@@ -435,8 +435,8 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %0  = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -464,7 +464,7 @@ entry:
 ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %0  = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
   ret void
 }
@@ -474,7 +474,7 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %0  = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -484,7 +484,7 @@ entry:
 ; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
-  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
   %0  = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
   ret void
 }
@@ -494,7 +494,7 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
-  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
   %0  = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -504,8 +504,8 @@ entry:
 ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %0  = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
   ret void
 }
@@ -515,8 +515,8 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %0  = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -544,7 +544,7 @@ entry:
 ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %0  = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
   ret void
 }
@@ -554,7 +554,7 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %0  = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -564,7 +564,7 @@ entry:
 ; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
-  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
   %0  = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
   ret void
 }
@@ -574,7 +574,7 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
-  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
   %0  = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -584,8 +584,8 @@ entry:
 ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %0  = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
   ret void
 }
@@ -595,8 +595,8 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %0  = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -624,7 +624,7 @@ entry:
 ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %0  = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
   ret void
 }
@@ -634,7 +634,7 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %0  = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -644,7 +644,7 @@ entry:
 ; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
-  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
   %0  = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
   ret void
 }
@@ -654,7 +654,7 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
-  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
   %0  = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -664,8 +664,8 @@ entry:
 ; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %0  = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
   ret void
 }
@@ -675,8 +675,8 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %0  = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -704,7 +704,7 @@ entry:
 ; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %0  = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
   ret void
 }
@@ -714,7 +714,7 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %0  = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -724,7 +724,7 @@ entry:
 ; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
-  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
   %0  = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
   ret void
 }
@@ -734,7 +734,7 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
-  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
   %0  = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -744,8 +744,8 @@ entry:
 ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %0  = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
   ret void
 }
@@ -755,8 +755,8 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %0  = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
@@ -784,7 +784,7 @@ entry:
 ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %0  = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
   ret void
 }
@@ -794,7 +794,7 @@ entry:
 ; SI: buffer_store_dword [[RET]]
 define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
-  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %0  = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
   store i32 %0, i32 addrspace(1)* %out2
   ret void
diff --git a/test/CodeGen/R600/gv-const-addrspace-fail.ll b/test/CodeGen/R600/gv-const-addrspace-fail.ll
index af0df413ca58..014b0a5482ab 100644
--- a/test/CodeGen/R600/gv-const-addrspace-fail.ll
+++ b/test/CodeGen/R600/gv-const-addrspace-fail.ll
@@ -9,8 +9,8 @@
 ; SI: buffer_store_byte
 ; SI: s_endpgm
 define void @test_i8( i32 %s, i8 addrspace(1)* %out) #3 {
-  %arrayidx = getelementptr inbounds [1 x i8] addrspace(2)* @a, i32 0, i32 %s
-  %1 = load i8 addrspace(2)* %arrayidx, align 1
+  %arrayidx = getelementptr inbounds [1 x i8], [1 x i8] addrspace(2)* @a, i32 0, i32 %s
+  %1 = load i8, i8 addrspace(2)* %arrayidx, align 1
   store i8 %1, i8 addrspace(1)* %out
   ret void
 }
@@ -22,8 +22,8 @@ define void @test_i8( i32 %s, i8 addrspace(1)* %out) #3 {
 ; SI: buffer_store_short
 ; SI: s_endpgm
 define void @test_i16( i32 %s, i16 addrspace(1)* %out) #3 {
-  %arrayidx = getelementptr inbounds [1 x i16] addrspace(2)* @b, i32 0, i32 %s
-  %1 = load i16 addrspace(2)* %arrayidx, align 2
+  %arrayidx = getelementptr inbounds [1 x i16], [1 x i16] addrspace(2)* @b, i32 0, i32 %s
+  %1 = load i16, i16 addrspace(2)* %arrayidx, align 2
   store i16 %1, i16 addrspace(1)* %out
   ret void
 }
@@ -35,8 +35,8 @@ define void @test_i16( i32 %s, i16 addrspace(1)* %out) #3 {
 
 ; FUNC-LABEL: {{^}}struct_bar_gv_load:
 define void @struct_bar_gv_load(i8 addrspace(1)* %out, i32 %index) {
-  %gep = getelementptr inbounds [1 x %struct.bar] addrspace(2)* @struct_bar_gv, i32 0, i32 0, i32 1, i32 %index
-  %load = load i8 addrspace(2)* %gep, align 1
+  %gep = getelementptr inbounds [1 x %struct.bar], [1 x %struct.bar] addrspace(2)* @struct_bar_gv, i32 0, i32 0, i32 1, i32 %index
+  %load = load i8, i8 addrspace(2)* %gep, align 1
   store i8 %load, i8 addrspace(1)* %out, align 1
   ret void
 }
@@ -50,8 +50,8 @@ define void @struct_bar_gv_load(i8 addrspace(1)* %out, i32 %index) {
 
 ; FUNC-LABEL: {{^}}array_vector_gv_load:
 define void @array_vector_gv_load(<4 x i32> addrspace(1)* %out, i32 %index) {
-  %gep = getelementptr inbounds [4 x <4 x i32>] addrspace(2)* @array_vector_gv, i32 0, i32 %index
-  %load = load <4 x i32> addrspace(2)* %gep, align 16
+  %gep = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>] addrspace(2)* @array_vector_gv, i32 0, i32 %index
+  %load = load <4 x i32>, <4 x i32> addrspace(2)* %gep, align 16
   store <4 x i32> %load, <4 x i32> addrspace(1)* %out, align 16
   ret void
 }
diff --git a/test/CodeGen/R600/gv-const-addrspace.ll b/test/CodeGen/R600/gv-const-addrspace.ll
index c58e5846d98c..3c1fc6c98f74 100644
--- a/test/CodeGen/R600/gv-const-addrspace.ll
+++ b/test/CodeGen/R600/gv-const-addrspace.ll
@@ -1,5 +1,6 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 
 @b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2
@@ -9,6 +10,7 @@
 ; FUNC-LABEL: {{^}}float:
 ; FIXME: We should be using s_load_dword here.
 ; SI: buffer_load_dword
+; VI: s_load_dword
 
 ; EG-DAG: MOV {{\** *}}T2.X
 ; EG-DAG: MOV {{\** *}}T3.X
@@ -19,8 +21,8 @@
 
 define void @float(float addrspace(1)* %out, i32 %index) {
 entry:
-  %0 = getelementptr inbounds [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
-  %1 = load float addrspace(2)* %0
+  %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
+  %1 = load float, float addrspace(2)* %0
   store float %1, float addrspace(1)* %out
   ret void
 }
@@ -31,6 +33,7 @@ entry:
 
 ; FIXME: We should be using s_load_dword here.
 ; SI: buffer_load_dword
+; VI: s_load_dword
 
 ; EG-DAG: MOV {{\** *}}T2.X
 ; EG-DAG: MOV {{\** *}}T3.X
@@ -41,8 +44,8 @@ entry:
 
 define void @i32(i32 addrspace(1)* %out, i32 %index) {
 entry:
-  %0 = getelementptr inbounds [5 x i32] addrspace(2)* @i32_gv, i32 0, i32 %index
-  %1 = load i32 addrspace(2)* %0
+  %0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(2)* @i32_gv, i32 0, i32 %index
+  %1 = load i32, i32 addrspace(2)* %0
   store i32 %1, i32 addrspace(1)* %out
   ret void
 }
@@ -53,11 +56,11 @@ entry:
 @struct_foo_gv = internal unnamed_addr addrspace(2) constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ]
 
 ; FUNC-LABEL: {{^}}struct_foo_gv_load:
-; SI: s_load_dword
+; GCN: s_load_dword
 
 define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
-  %gep = getelementptr inbounds [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index
-  %load = load i32 addrspace(2)* %gep, align 4
+  %gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index
+  %load = load i32, i32 addrspace(2)* %gep, align 4
   store i32 %load, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -70,9 +73,10 @@ define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
 ; FUNC-LABEL: {{^}}array_v1_gv_load:
 ; FIXME: We should be using s_load_dword here.
 ; SI: buffer_load_dword
+; VI: s_load_dword
 define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
-  %gep = getelementptr inbounds [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index
-  %load = load <1 x i32> addrspace(2)* %gep, align 4
+  %gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index
+  %load = load <1 x i32>, <1 x i32> addrspace(2)* %gep, align 4
   store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4
   ret void
 }
@@ -83,8 +87,8 @@ entry:
   br i1 %0, label %if, label %else
 
 if:
-  %1 = getelementptr inbounds [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
-  %2 = load float addrspace(2)* %1
+  %1 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
+  %2 = load float, float addrspace(2)* %1
   store float %2, float addrspace(1)* %out
   br label %endif
 
diff --git a/test/CodeGen/R600/half.ll b/test/CodeGen/R600/half.ll
index 35a41c5cd0b0..42ee788e88d5 100644
--- a/test/CodeGen/R600/half.ll
+++ b/test/CodeGen/R600/half.ll
@@ -5,7 +5,7 @@ define void @test_load_store(half addrspace(1)* %in, half addrspace(1)* %out) {
 ; CHECK-LABEL: {{^}}test_load_store:
 ; CHECK: buffer_load_ushort [[TMP:v[0-9]+]]
 ; CHECK: buffer_store_short [[TMP]]
-  %val = load half addrspace(1)* %in
+  %val = load half, half addrspace(1)* %in
   store half %val, half addrspace(1) * %out
   ret void
 }
@@ -14,7 +14,7 @@ define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %o
 ; CHECK-LABEL: {{^}}test_bitcast_from_half:
 ; CHECK: buffer_load_ushort [[TMP:v[0-9]+]]
 ; CHECK: buffer_store_short [[TMP]]
-  %val = load half addrspace(1) * %in
+  %val = load half, half addrspace(1) * %in
   %val_int = bitcast half %val to i16
   store i16 %val_int, i16 addrspace(1)* %out
   ret void
@@ -24,7 +24,7 @@ define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in
 ; CHECK-LABEL: {{^}}test_bitcast_to_half:
 ; CHECK: buffer_load_ushort [[TMP:v[0-9]+]]
 ; CHECK: buffer_store_short [[TMP]]
-  %val = load i16 addrspace(1)* %in
+  %val = load i16, i16 addrspace(1)* %in
   %val_fp = bitcast i16 %val to half
   store half %val_fp, half addrspace(1)* %out
   ret void
@@ -34,7 +34,7 @@ define void @test_extend32(half addrspace(1)* %in, float addrspace(1)* %out) {
 ; CHECK-LABEL: {{^}}test_extend32:
 ; CHECK: v_cvt_f32_f16_e32
 
-  %val16 = load half addrspace(1)* %in
+  %val16 = load half, half addrspace(1)* %in
   %val32 = fpext half %val16 to float
   store float %val32, float addrspace(1)* %out
   ret void
@@ -45,7 +45,7 @@ define void @test_extend64(half addrspace(1)* %in, double addrspace(1)* %out) {
 ; CHECK: v_cvt_f32_f16_e32
 ; CHECK: v_cvt_f64_f32_e32
 
-  %val16 = load half addrspace(1)* %in
+  %val16 = load half, half addrspace(1)* %in
   %val64 = fpext half %val16 to double
   store double %val64, double addrspace(1)* %out
   ret void
@@ -55,7 +55,7 @@ define void @test_trunc32(float addrspace(1)* %in, half addrspace(1)* %out) {
 ; CHECK-LABEL: {{^}}test_trunc32:
 ; CHECK: v_cvt_f16_f32_e32
 
-  %val32 = load float addrspace(1)* %in
+  %val32 = load float, float addrspace(1)* %in
   %val16 = fptrunc float %val32 to half
   store half %val16, half addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/hsa.ll b/test/CodeGen/R600/hsa.ll
index 5ce3beaa16c0..f9113399afe8 100644
--- a/test/CodeGen/R600/hsa.ll
+++ b/test/CodeGen/R600/hsa.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -mtriple=r600--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s
 
-; HSA: {{^}}simple:
 ; HSA: .section        .hsa.version
 ; HSA-NEXT: .ascii  "HSA Code Unit:0.0:AMD:0.1:GFX8.1:0"
+; HSA: {{^}}simple:
 ; Make sure we are setting the ATC bit:
 ; HSA: s_mov_b32 s[[HI:[0-9]]], 0x100f000
 ; HSA: buffer_store_dword v{{[0-9]+}}, s[0:[[HI]]], 0
diff --git a/test/CodeGen/R600/i1-copy-phi.ll b/test/CodeGen/R600/i1-copy-phi.ll
index 430466e9f80e..105cd06b330a 100644
--- a/test/CodeGen/R600/i1-copy-phi.ll
+++ b/test/CodeGen/R600/i1-copy-phi.ll
@@ -6,7 +6,7 @@
 ; SI: s_and_saveexec_b64
 ; SI: s_xor_b64
 ; SI: v_mov_b32_e32 [[REG]], -1{{$}}
-; SI: v_cmp_ne_i32_e64 {{s\[[0-9]+:[0-9]+\]}}, [[REG]], 0
+; SI: v_cmp_ne_i32_e32 vcc, 0, [[REG]]
 ; SI: s_and_saveexec_b64
 ; SI: s_xor_b64
 ; SI: s_endpgm
diff --git a/test/CodeGen/R600/i8-to-double-to-float.ll b/test/CodeGen/R600/i8-to-double-to-float.ll
index 604746627666..c218e1918bb0 100644
--- a/test/CodeGen/R600/i8-to-double-to-float.ll
+++ b/test/CodeGen/R600/i8-to-double-to-float.ll
@@ -3,7 +3,7 @@
 ;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 define void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) {
-  %1 = load i8 addrspace(1)* %in
+  %1 = load i8, i8 addrspace(1)* %in
   %2 = uitofp i8 %1 to double
   %3 = fptrunc double %2 to float
   store float %3, float addrspace(1)* %out
diff --git a/test/CodeGen/R600/icmp-select-sete-reverse-args.ll b/test/CodeGen/R600/icmp-select-sete-reverse-args.ll
index 71705a64f50e..60e59a5a5286 100644
--- a/test/CodeGen/R600/icmp-select-sete-reverse-args.ll
+++ b/test/CodeGen/R600/icmp-select-sete-reverse-args.ll
@@ -8,9 +8,9 @@
 
 define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
-  %0 = load i32 addrspace(1)* %in
-  %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %in, i32 1
-  %1 = load i32 addrspace(1)* %arrayidx1
+  %0 = load i32, i32 addrspace(1)* %in
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %1 = load i32, i32 addrspace(1)* %arrayidx1
   %cmp = icmp eq i32 %0, %1
   %value = select i1 %cmp, i32 0, i32 -1
   store i32 %value, i32 addrspace(1)* %out
diff --git a/test/CodeGen/R600/imm.ll b/test/CodeGen/R600/imm.ll
index 6e4fa3cc60cb..8917cd6dba33 100644
--- a/test/CodeGen/R600/imm.ll
+++ b/test/CodeGen/R600/imm.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CHECK %s
 
 ; Use a 64-bit value with lo bits that can be represented as an inline constant
 ; CHECK-LABEL: {{^}}i64_imm_inline_lo:
@@ -22,81 +23,100 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: {{^}}store_inline_imm_0.0_f32
-; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; CHECK-LABEL: {{^}}store_imm_neg_0.0_i64:
+; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000
+; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
+  store i64 -9223372036854775808, i64 addrspace(1) *%out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_neg_0.0_i32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
 ; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) {
+  store i32 -2147483648, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_0.0_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; CHECK: buffer_store_dword [[REG]]
 define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) {
   store float 0.0, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}store_imm_neg_0.0_f32
+; CHECK-LABEL: {{^}}store_imm_neg_0.0_f32:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @store_imm_neg_0.0_f32(float addrspace(1)* %out) {
   store float -0.0, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}store_inline_imm_0.5_f32
+; CHECK-LABEL: {{^}}store_inline_imm_0.5_f32:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0.5{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @store_inline_imm_0.5_f32(float addrspace(1)* %out) {
   store float 0.5, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f32
+; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f32:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -0.5{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) {
   store float -0.5, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}store_inline_imm_1.0_f32
+; CHECK-LABEL: {{^}}store_inline_imm_1.0_f32:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @store_inline_imm_1.0_f32(float addrspace(1)* %out) {
   store float 1.0, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f32
+; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f32:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) {
   store float -1.0, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}store_inline_imm_2.0_f32
+; CHECK-LABEL: {{^}}store_inline_imm_2.0_f32:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @store_inline_imm_2.0_f32(float addrspace(1)* %out) {
   store float 2.0, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f32
+; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f32:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -2.0{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) {
   store float -2.0, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}store_inline_imm_4.0_f32
+; CHECK-LABEL: {{^}}store_inline_imm_4.0_f32:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 4.0{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @store_inline_imm_4.0_f32(float addrspace(1)* %out) {
   store float 4.0, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f32
+; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f32:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -4.0{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) {
   store float -4.0, float addrspace(1)* %out
   ret void
@@ -104,288 +124,298 @@ define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) {
 
 ; CHECK-LABEL: {{^}}store_literal_imm_f32:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x45800000
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @store_literal_imm_f32(float addrspace(1)* %out) {
   store float 4096.0, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_0.0_f32
+; CHECK-LABEL: {{^}}add_inline_imm_0.0_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0, [[VAL]]{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0.0
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_0.5_f32
+; CHECK-LABEL: {{^}}add_inline_imm_0.5_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0.5
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f32
+; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, -0.5
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_1.0_f32
+; CHECK-LABEL: {{^}}add_inline_imm_1.0_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 1.0
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f32
+; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, -1.0
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_2.0_f32
+; CHECK-LABEL: {{^}}add_inline_imm_2.0_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 2.0
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f32
+; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, -2.0
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_4.0_f32
+; CHECK-LABEL: {{^}}add_inline_imm_4.0_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 4.0
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f32
+; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, -4.0
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: @commute_add_inline_imm_0.5_f32
+; CHECK-LABEL: {{^}}commute_add_inline_imm_0.5_f32:
 ; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
 ; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
-  %x = load float addrspace(1)* %in
+  %x = load float, float addrspace(1)* %in
   %y = fadd float %x, 0.5
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: @commute_add_literal_f32
+; CHECK-LABEL: {{^}}commute_add_literal_f32:
 ; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
 ; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0x44800000, [[VAL]]
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
-  %x = load float addrspace(1)* %in
+  %x = load float, float addrspace(1)* %in
   %y = fadd float %x, 1024.0
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_1_f32
+; CHECK-LABEL: {{^}}add_inline_imm_1_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 1, [[VAL]]{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0x36a0000000000000
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_2_f32
+; CHECK-LABEL: {{^}}add_inline_imm_2_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 2, [[VAL]]{{$}}
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0x36b0000000000000
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_16_f32
+; CHECK-LABEL: {{^}}add_inline_imm_16_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 16, [[VAL]]
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0x36e0000000000000
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f32
+; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -1, [[VAL]]
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0xffffffffe0000000
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_neg_2_f32
+; CHECK-LABEL: {{^}}add_inline_imm_neg_2_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -2, [[VAL]]
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0xffffffffc0000000
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f32
+; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -16, [[VAL]]
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0xfffffffe00000000
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_63_f32
+; CHECK-LABEL: {{^}}add_inline_imm_63_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 63, [[VAL]]
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0x36ff800000000000
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_64_f32
+; CHECK-LABEL: {{^}}add_inline_imm_64_f32:
 ; CHECK: s_load_dword [[VAL:s[0-9]+]]
 ; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 64, [[VAL]]
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0x3700000000000000
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_0.0_f64
-; CHECK: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+
+; CHECK-LABEL: {{^}}add_inline_imm_0.0_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0, [[VAL]]
-; CHECK-NEXT: buffer_store_dwordx2 [[REG]]
+; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0.0
   store double %y, double addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_0.5_f64
-; CHECK: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; CHECK-LABEL: {{^}}add_inline_imm_0.5_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0.5, [[VAL]]
-; CHECK-NEXT: buffer_store_dwordx2 [[REG]]
+; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0.5
   store double %y, double addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f64
-; CHECK: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -0.5, [[VAL]]
-; CHECK-NEXT: buffer_store_dwordx2 [[REG]]
+; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, -0.5
   store double %y, double addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_1.0_f64
-; CHECK: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; CHECK-LABEL: {{^}}add_inline_imm_1.0_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, [[VAL]]
-; CHECK-NEXT: buffer_store_dwordx2 [[REG]]
+; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 1.0
   store double %y, double addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f64
-; CHECK: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1.0, [[VAL]]
-; CHECK-NEXT: buffer_store_dwordx2 [[REG]]
+; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, -1.0
   store double %y, double addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_2.0_f64
-; CHECK: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; CHECK-LABEL: {{^}}add_inline_imm_2.0_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2.0, [[VAL]]
-; CHECK-NEXT: buffer_store_dwordx2 [[REG]]
+; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 2.0
   store double %y, double addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f64
-; CHECK: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2.0, [[VAL]]
-; CHECK-NEXT: buffer_store_dwordx2 [[REG]]
+; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, -2.0
   store double %y, double addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_4.0_f64
-; CHECK: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; CHECK-LABEL: {{^}}add_inline_imm_4.0_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 4.0, [[VAL]]
-; CHECK-NEXT: buffer_store_dwordx2 [[REG]]
+; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 4.0
   store double %y, double addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f64
-; CHECK: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -4.0, [[VAL]]
-; CHECK-NEXT: buffer_store_dwordx2 [[REG]]
+; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, -4.0
   store double %y, double addrspace(1)* %out
@@ -393,80 +423,88 @@ define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) {
 }
 
 
-; CHECK-LABEL: {{^}}add_inline_imm_1_f64
-; CHECK: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; CHECK-LABEL: {{^}}add_inline_imm_1_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1, [[VAL]]
-; CHECK-NEXT: buffer_store_dwordx2 [[REG]]
+; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x0000000000000001
   store double %y, double addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_2_f64
-; CHECK: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; CHECK-LABEL: {{^}}add_inline_imm_2_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2, [[VAL]]
-; CHECK-NEXT: buffer_store_dwordx2 [[REG]]
+; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x0000000000000002
   store double %y, double addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_16_f64
-; CHECK: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; CHECK-LABEL: {{^}}add_inline_imm_16_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 16, [[VAL]]
-; CHECK-NEXT: buffer_store_dwordx2 [[REG]]
+; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x0000000000000010
   store double %y, double addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f64
-; CHECK: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1, [[VAL]]
-; CHECK-NEXT: buffer_store_dwordx2 [[REG]]
+; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0xffffffffffffffff
   store double %y, double addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_neg_2_f64
-; CHECK: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; CHECK-LABEL: {{^}}add_inline_imm_neg_2_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2, [[VAL]]
-; CHECK-NEXT: buffer_store_dwordx2 [[REG]]
+; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0xfffffffffffffffe
   store double %y, double addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f64
-; CHECK: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -16, [[VAL]]
-; CHECK-NEXT: buffer_store_dwordx2 [[REG]]
+; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0xfffffffffffffff0
   store double %y, double addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_63_f64
-; CHECK: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; CHECK-LABEL: {{^}}add_inline_imm_63_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 63, [[VAL]]
-; CHECK-NEXT: buffer_store_dwordx2 [[REG]]
+; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x000000000000003F
   store double %y, double addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: {{^}}add_inline_imm_64_f64
-; CHECK: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; CHECK-LABEL: {{^}}add_inline_imm_64_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 64, [[VAL]]
-; CHECK-NEXT: buffer_store_dwordx2 [[REG]]
+; CHECK: buffer_store_dwordx2 [[REG]]
 define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x0000000000000040
   store double %y, double addrspace(1)* %out
@@ -474,7 +512,7 @@ define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
 }
 
 
-; CHECK-LABEL: {{^}}store_inline_imm_0.0_f64
+; CHECK-LABEL: {{^}}store_inline_imm_0.0_f64:
 ; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0
 ; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0
 ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
@@ -482,3 +520,98 @@ define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
   store double 0.0, double addrspace(1)* %out
   ret void
 }
+
+
+; CHECK-LABEL: {{^}}store_literal_imm_neg_0.0_f64:
+; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000
+; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) {
+  store double -0.0, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_0.5_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fe00000
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_0.5_f64(double addrspace(1)* %out) {
+  store double 0.5, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfe00000
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) {
+  store double -0.5, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_1.0_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3ff00000
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_1.0_f64(double addrspace(1)* %out) {
+  store double 1.0, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbff00000
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) {
+  store double -1.0, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_2.0_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 2.0
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_2.0_f64(double addrspace(1)* %out) {
+  store double 2.0, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], -2.0
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) {
+  store double -2.0, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_4.0_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40100000
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_4.0_f64(double addrspace(1)* %out) {
+  store double 4.0, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xc0100000
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) {
+  store double -4.0, double addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_literal_imm_f64:
+; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x40b00000
+; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_literal_imm_f64(double addrspace(1)* %out) {
+  store double 4096.0, double addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/indirect-private-64.ll b/test/CodeGen/R600/indirect-private-64.ll
index cb06d609da49..d63e1b6c5212 100644
--- a/test/CodeGen/R600/indirect-private-64.ll
+++ b/test/CodeGen/R600/indirect-private-64.ll
@@ -14,12 +14,12 @@ declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
 ; SI-PROMOTE: ds_write_b64
 ; SI-PROMOTE: ds_read_b64
 define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
-  %val = load double addrspace(1)* %in, align 8
+  %val = load double, double addrspace(1)* %in, align 8
   %array = alloca double, i32 16, align 8
-  %ptr = getelementptr double* %array, i32 %b
+  %ptr = getelementptr double, double* %array, i32 %b
   store double %val, double* %ptr, align 8
   call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
-  %result = load double* %ptr, align 8
+  %result = load double, double* %ptr, align 8
   store double %result, double addrspace(1)* %out, align 8
   ret void
 }
@@ -38,12 +38,12 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double
 ; SI-PROMOTE: ds_read_b32
 ; SI-PROMOTE: ds_read_b32
 define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
-  %val = load <2 x double> addrspace(1)* %in, align 16
+  %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16
   %array = alloca <2 x double>, i32 16, align 16
-  %ptr = getelementptr <2 x double>* %array, i32 %b
+  %ptr = getelementptr <2 x double>, <2 x double>* %array, i32 %b
   store <2 x double> %val, <2 x double>* %ptr, align 16
   call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
-  %result = load <2 x double>* %ptr, align 16
+  %result = load <2 x double>, <2 x double>* %ptr, align 16
   store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16
   ret void
 }
@@ -56,12 +56,12 @@ define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out
 ; SI-PROMOTE: ds_write_b64
 ; SI-PROMOTE: ds_read_b64
 define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind {
-  %val = load i64 addrspace(1)* %in, align 8
+  %val = load i64, i64 addrspace(1)* %in, align 8
   %array = alloca i64, i32 16, align 8
-  %ptr = getelementptr i64* %array, i32 %b
+  %ptr = getelementptr i64, i64* %array, i32 %b
   store i64 %val, i64* %ptr, align 8
   call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
-  %result = load i64* %ptr, align 8
+  %result = load i64, i64* %ptr, align 8
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
@@ -80,12 +80,12 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs
 ; SI-PROMOTE: ds_read_b32
 ; SI-PROMOTE: ds_read_b32
 define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind {
-  %val = load <2 x i64> addrspace(1)* %in, align 16
+  %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
   %array = alloca <2 x i64>, i32 16, align 16
-  %ptr = getelementptr <2 x i64>* %array, i32 %b
+  %ptr = getelementptr <2 x i64>, <2 x i64>* %array, i32 %b
   store <2 x i64> %val, <2 x i64>* %ptr, align 16
   call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
-  %result = load <2 x i64>* %ptr, align 16
+  %result = load <2 x i64>, <2 x i64>* %ptr, align 16
   store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16
   ret void
 }
diff --git a/test/CodeGen/R600/inline-asm.ll b/test/CodeGen/R600/inline-asm.ll
index 37e4486db380..efc2292de3a5 100644
--- a/test/CodeGen/R600/inline-asm.ll
+++ b/test/CodeGen/R600/inline-asm.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=r600 -mcpu=tonga -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ; CHECK: {{^}}inline_asm:
 ; CHECK: s_endpgm
diff --git a/test/CodeGen/R600/insert_vector_elt.ll b/test/CodeGen/R600/insert_vector_elt.ll
index 64afddcca21d..6de3d408c486 100644
--- a/test/CodeGen/R600/insert_vector_elt.ll
+++ b/test/CodeGen/R600/insert_vector_elt.ll
@@ -185,13 +185,13 @@ entry:
   br i1 %1, label %if, label %else
 
 if:
-  %2 = load i32 addrspace(1)* %in
+  %2 = load i32, i32 addrspace(1)* %in
   %3 = insertelement <2 x i32> %0, i32 %2, i32 1
   br label %endif
 
 else:
-  %4 = getelementptr i32 addrspace(1)* %in, i32 1
-  %5 = load i32 addrspace(1)* %4
+  %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %5 = load i32, i32 addrspace(1)* %4
   %6 = insertelement <2 x i32> %0, i32 %5, i32 1
   br label %endif
 
diff --git a/test/CodeGen/R600/jump-address.ll b/test/CodeGen/R600/jump-address.ll
index a1cd3882443a..f55912e37401 100644
--- a/test/CodeGen/R600/jump-address.ll
+++ b/test/CodeGen/R600/jump-address.ll
@@ -6,7 +6,7 @@
 
 define void @main() #0 {
 main_body:
-  %0 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %1 = extractelement <4 x float> %0, i32 0
   %2 = bitcast float %1 to i32
   %3 = icmp eq i32 %2, 0
@@ -17,7 +17,7 @@ main_body:
   br i1 %7, label %ENDIF, label %ELSE
 
 ELSE:                                             ; preds = %main_body
-  %8 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %9 = extractelement <4 x float> %8, i32 0
   %10 = bitcast float %9 to i32
   %11 = icmp eq i32 %10, 1
@@ -40,7 +40,7 @@ ENDIF:                                            ; preds = %IF13, %ELSE, %main_
   ret void
 
 IF13:                                             ; preds = %ELSE
-  %20 = load <4 x float> addrspace(8)* null
+  %20 = load <4 x float>, <4 x float> addrspace(8)* null
   %21 = extractelement <4 x float> %20, i32 0
   %22 = fsub float -0.000000e+00, %21
   %23 = fadd float 0xFFF8000000000000, %22
diff --git a/test/CodeGen/R600/kcache-fold.ll b/test/CodeGen/R600/kcache-fold.ll
index 27840b2e1609..7e2291cfdc35 100644
--- a/test/CodeGen/R600/kcache-fold.ll
+++ b/test/CodeGen/R600/kcache-fold.ll
@@ -4,35 +4,35 @@
 ; CHECK: MOV * T{{[0-9]+\.[XYZW], KC0}}
 define void @main1() {
 main_body:
-  %0 = load <4 x float> addrspace(8)* null
+  %0 = load <4 x float>, <4 x float> addrspace(8)* null
   %1 = extractelement <4 x float> %0, i32 0
-  %2 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %3 = extractelement <4 x float> %2, i32 0
-  %4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %5 = extractelement <4 x float> %4, i32 0
   %6 = fcmp ogt float %1, 0.000000e+00
   %7 = select i1 %6, float %3, float %5
-  %8 = load <4 x float> addrspace(8)* null
+  %8 = load <4 x float>, <4 x float> addrspace(8)* null
   %9 = extractelement <4 x float> %8, i32 1
-  %10 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %11 = extractelement <4 x float> %10, i32 1
-  %12 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %13 = extractelement <4 x float> %12, i32 1
   %14 = fcmp ogt float %9, 0.000000e+00
   %15 = select i1 %14, float %11, float %13
-  %16 = load <4 x float> addrspace(8)* null
+  %16 = load <4 x float>, <4 x float> addrspace(8)* null
   %17 = extractelement <4 x float> %16, i32 2
-  %18 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %19 = extractelement <4 x float> %18, i32 2
-  %20 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %21 = extractelement <4 x float> %20, i32 2
   %22 = fcmp ogt float %17, 0.000000e+00
   %23 = select i1 %22, float %19, float %21
-  %24 = load <4 x float> addrspace(8)* null
+  %24 = load <4 x float>, <4 x float> addrspace(8)* null
   %25 = extractelement <4 x float> %24, i32 3
-  %26 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %27 = extractelement <4 x float> %26, i32 3
-  %28 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %29 = extractelement <4 x float> %28, i32 3
   %30 = fcmp ogt float %25, 0.000000e+00
   %31 = select i1 %30, float %27, float %29
@@ -52,35 +52,35 @@ main_body:
 ; CHECK-NOT: MOV
 define void @main2() {
 main_body:
-  %0 = load <4 x float> addrspace(8)* null
+  %0 = load <4 x float>, <4 x float> addrspace(8)* null
   %1 = extractelement <4 x float> %0, i32 0
-  %2 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %3 = extractelement <4 x float> %2, i32 0
-  %4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %5 = extractelement <4 x float> %4, i32 1
   %6 = fcmp ogt float %1, 0.000000e+00
   %7 = select i1 %6, float %3, float %5
-  %8 = load <4 x float> addrspace(8)* null
+  %8 = load <4 x float>, <4 x float> addrspace(8)* null
   %9 = extractelement <4 x float> %8, i32 1
-  %10 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %11 = extractelement <4 x float> %10, i32 0
-  %12 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %13 = extractelement <4 x float> %12, i32 1
   %14 = fcmp ogt float %9, 0.000000e+00
   %15 = select i1 %14, float %11, float %13
-  %16 = load <4 x float> addrspace(8)* null
+  %16 = load <4 x float>, <4 x float> addrspace(8)* null
   %17 = extractelement <4 x float> %16, i32 2
-  %18 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %19 = extractelement <4 x float> %18, i32 3
-  %20 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %21 = extractelement <4 x float> %20, i32 2
   %22 = fcmp ogt float %17, 0.000000e+00
   %23 = select i1 %22, float %19, float %21
-  %24 = load <4 x float> addrspace(8)* null
+  %24 = load <4 x float>, <4 x float> addrspace(8)* null
   %25 = extractelement <4 x float> %24, i32 3
-  %26 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %27 = extractelement <4 x float> %26, i32 3
-  %28 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %29 = extractelement <4 x float> %28, i32 2
   %30 = fcmp ogt float %25, 0.000000e+00
   %31 = select i1 %30, float %27, float %29
diff --git a/test/CodeGen/R600/kernel-args.ll b/test/CodeGen/R600/kernel-args.ll
index 42d289d4ef23..1dd7c2cb7995 100644
--- a/test/CodeGen/R600/kernel-args.ll
+++ b/test/CodeGen/R600/kernel-args.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG
-; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
 
-; EG-LABEL: {{^}}i8_arg:
+; FUNC-LABEL: {{^}}i8_arg:
 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-LABEL: {{^}}i8_arg:
-; SI: buffer_load_ubyte
+; GCN: buffer_load_ubyte
 
 define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
 entry:
@@ -14,10 +14,10 @@ entry:
   ret void
 }
 
-; EG-LABEL: {{^}}i8_zext_arg:
+; FUNC-LABEL: {{^}}i8_zext_arg:
 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-LABEL: {{^}}i8_zext_arg:
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 
 define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
 entry:
@@ -26,10 +26,10 @@ entry:
   ret void
 }
 
-; EG-LABEL: {{^}}i8_sext_arg:
+; FUNC-LABEL: {{^}}i8_sext_arg:
 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-LABEL: {{^}}i8_sext_arg:
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 
 define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
 entry:
@@ -38,10 +38,9 @@ entry:
   ret void
 }
 
-; EG-LABEL: {{^}}i16_arg:
+; FUNC-LABEL: {{^}}i16_arg:
 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-LABEL: {{^}}i16_arg:
-; SI: buffer_load_ushort
+; GCN: buffer_load_ushort
 
 define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
 entry:
@@ -50,10 +49,10 @@ entry:
   ret void
 }
 
-; EG-LABEL: {{^}}i16_zext_arg:
+; FUNC-LABEL: {{^}}i16_zext_arg:
 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-LABEL: {{^}}i16_zext_arg:
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 
 define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
 entry:
@@ -62,10 +61,10 @@ entry:
   ret void
 }
 
-; EG-LABEL: {{^}}i16_sext_arg:
+; FUNC-LABEL: {{^}}i16_sext_arg:
 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-LABEL: {{^}}i16_sext_arg:
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 
 define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
 entry:
@@ -74,176 +73,170 @@ entry:
   ret void
 }
 
-; EG-LABEL: {{^}}i32_arg:
+; FUNC-LABEL: {{^}}i32_arg:
 ; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
-; SI-LABEL: {{^}}i32_arg:
-; s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 define void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
 entry:
   store i32 %in, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-LABEL: {{^}}f32_arg:
+; FUNC-LABEL: {{^}}f32_arg:
 ; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
-; SI-LABEL: {{^}}f32_arg:
-; s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 define void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
 entry:
   store float %in, float addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-LABEL: {{^}}v2i8_arg:
+; FUNC-LABEL: {{^}}v2i8_arg:
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
-; SI-LABEL: {{^}}v2i8_arg:
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
 define void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
 entry:
   store <2 x i8> %in, <2 x i8> addrspace(1)* %out
   ret void
 }
 
-; EG-LABEL: {{^}}v2i16_arg:
+; FUNC-LABEL: {{^}}v2i16_arg:
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
-; SI-LABEL: {{^}}v2i16_arg:
-; SI-DAG: buffer_load_ushort
-; SI-DAG: buffer_load_ushort
+; GCN-DAG: buffer_load_ushort
+; GCN-DAG: buffer_load_ushort
 define void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
 entry:
   store <2 x i16> %in, <2 x i16> addrspace(1)* %out
   ret void
 }
 
-; EG-LABEL: {{^}}v2i32_arg:
+; FUNC-LABEL: {{^}}v2i32_arg:
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
-; SI-LABEL: {{^}}v2i32_arg:
 ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
+; VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
 define void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
 entry:
   store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-LABEL: {{^}}v2f32_arg:
+; FUNC-LABEL: {{^}}v2f32_arg:
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
-; SI-LABEL: {{^}}v2f32_arg:
 ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
+; VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
 define void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
 entry:
   store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-LABEL: {{^}}v3i8_arg:
+; FUNC-LABEL: {{^}}v3i8_arg:
 ; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
 ; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
 ; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
-; SI-LABEL: {{^}}v3i8_arg:
 define void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
 entry:
   store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-LABEL: {{^}}v3i16_arg:
+; FUNC-LABEL: {{^}}v3i16_arg:
 ; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
 ; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
 ; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
-; SI-LABEL: {{^}}v3i16_arg:
 define void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
 entry:
   store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
   ret void
 }
-; EG-LABEL: {{^}}v3i32_arg:
+; FUNC-LABEL: {{^}}v3i32_arg:
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
-; SI-LABEL: {{^}}v3i32_arg:
 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
+; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
 define void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
 entry:
   store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-LABEL: {{^}}v3f32_arg:
+; FUNC-LABEL: {{^}}v3f32_arg:
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
-; SI-LABEL: {{^}}v3f32_arg:
 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
+; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
 define void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
 entry:
   store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-LABEL: {{^}}v4i8_arg:
+; FUNC-LABEL: {{^}}v4i8_arg:
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
-; SI-LABEL: {{^}}v4i8_arg:
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
 define void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
 entry:
   store <4 x i8> %in, <4 x i8> addrspace(1)* %out
   ret void
 }
 
-; EG-LABEL: {{^}}v4i16_arg:
+; FUNC-LABEL: {{^}}v4i16_arg:
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
-; SI-LABEL: {{^}}v4i16_arg:
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
 define void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
 entry:
   store <4 x i16> %in, <4 x i16> addrspace(1)* %out
   ret void
 }
 
-; EG-LABEL: {{^}}v4i32_arg:
+; FUNC-LABEL: {{^}}v4i32_arg:
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
-; SI-LABEL: {{^}}v4i32_arg:
 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
+; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
 define void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-LABEL: {{^}}v4f32_arg:
+; FUNC-LABEL: {{^}}v4f32_arg:
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
-; SI-LABEL: {{^}}v4f32_arg:
 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
+; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
 define void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
 entry:
   store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-LABEL: {{^}}v8i8_arg:
+; FUNC-LABEL: {{^}}v8i8_arg:
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
@@ -252,21 +245,20 @@ entry:
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
-; SI-LABEL: {{^}}v8i8_arg:
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
 define void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
 entry:
   store <8 x i8> %in, <8 x i8> addrspace(1)* %out
   ret void
 }
 
-; EG-LABEL: {{^}}v8i16_arg:
+; FUNC-LABEL: {{^}}v8i16_arg:
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
@@ -275,22 +267,21 @@ entry:
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
-; SI-LABEL: {{^}}v8i16_arg:
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
 define void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
 entry:
   store <8 x i16> %in, <8 x i16> addrspace(1)* %out
   ret void
 }
 
-; EG-LABEL: {{^}}v8i32_arg:
+; FUNC-LABEL: {{^}}v8i32_arg:
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
@@ -299,15 +290,15 @@ entry:
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
-; SI-LABEL: {{^}}v8i32_arg:
 ; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
+; VI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44
 define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
 entry:
   store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-LABEL: {{^}}v8f32_arg:
+; FUNC-LABEL: {{^}}v8f32_arg:
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
@@ -316,7 +307,6 @@ entry:
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
-; SI-LABEL: {{^}}v8f32_arg:
 ; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
 define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
 entry:
@@ -324,7 +314,7 @@ entry:
   ret void
 }
 
-; EG-LABEL: {{^}}v16i8_arg:
+; FUNC-LABEL: {{^}}v16i8_arg:
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
@@ -341,30 +331,29 @@ entry:
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
-; SI-LABEL: {{^}}v16i8_arg:
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
 define void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
 entry:
   store <16 x i8> %in, <16 x i8> addrspace(1)* %out
   ret void
 }
 
-; EG-LABEL: {{^}}v16i16_arg:
+; FUNC-LABEL: {{^}}v16i16_arg:
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
@@ -381,30 +370,29 @@ entry:
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
-; SI-LABEL: {{^}}v16i16_arg:
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
 define void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
 entry:
   store <16 x i16> %in, <16 x i16> addrspace(1)* %out
   ret void
 }
 
-; EG-LABEL: {{^}}v16i32_arg:
+; FUNC-LABEL: {{^}}v16i32_arg:
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
@@ -421,15 +409,15 @@ entry:
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
-; SI-LABEL: {{^}}v16i32_arg:
 ; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
 define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
 entry:
   store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-LABEL: {{^}}v16f32_arg:
+; FUNC-LABEL: {{^}}v16f32_arg:
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
@@ -446,8 +434,8 @@ entry:
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
-; SI-LABEL: {{^}}v16f32_arg:
 ; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
 define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
 entry:
   store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
@@ -455,18 +443,30 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}kernel_arg_i64:
-; SI: s_load_dwordx2
-; SI: s_load_dwordx2
-; SI: buffer_store_dwordx2
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx2
+; GCN: buffer_store_dwordx2
 define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
   store i64 %a, i64 addrspace(1)* %out, align 8
   ret void
 }
 
+; FUNC-LABEL: {{^}}f64_kernel_arg:
+; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
+; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
+; VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
+; VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c
+; GCN: buffer_store_dwordx2
+define void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
+entry:
+  store double %in, double addrspace(1)* %out
+  ret void
+}
+
 ; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
-; XSI: s_load_dwordx2
-; XSI: s_load_dwordx2
-; XSI: buffer_store_dwordx2
+; XGCN: s_load_dwordx2
+; XGCN: s_load_dwordx2
+; XGCN: buffer_store_dwordx2
 ; define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
 ;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
 ;   ret void
diff --git a/test/CodeGen/R600/large-alloca.ll b/test/CodeGen/R600/large-alloca.ll
index 788816cf723f..671833d1a33a 100644
--- a/test/CodeGen/R600/large-alloca.ll
+++ b/test/CodeGen/R600/large-alloca.ll
@@ -5,10 +5,10 @@
 
 define void @large_alloca(i32 addrspace(1)* %out, i32 %x, i32 %y) nounwind {
   %large = alloca [8192 x i32], align 4
-  %gep = getelementptr [8192 x i32]* %large, i32 0, i32 8191
+  %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
   store i32 %x, i32* %gep
-  %gep1 = getelementptr [8192 x i32]* %large, i32 0, i32 %y
-  %0 = load i32* %gep1
+  %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y
+  %0 = load i32, i32* %gep1
   store i32 %0, i32 addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/R600/large-constant-initializer.ll b/test/CodeGen/R600/large-constant-initializer.ll
index c8671efbe6f9..9975b1b7f5cc 100644
--- a/test/CodeGen/R600/large-constant-initializer.ll
+++ b/test/CodeGen/R600/large-constant-initializer.ll
@@ -5,7 +5,7 @@
 @gv = external unnamed_addr addrspace(2) constant [239 x i32], align 4
 
 define void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind {
-  %val = load i32 addrspace(2)* getelementptr ([239 x i32] addrspace(2)* @gv, i64 0, i64 239), align 4
+  %val = load i32, i32 addrspace(2)* getelementptr ([239 x i32], [239 x i32] addrspace(2)* @gv, i64 0, i64 239), align 4
   %mul12 = mul nsw i32 %val, 7
   br i1 undef, label %exit, label %bb
 
diff --git a/test/CodeGen/R600/lds-initializer.ll b/test/CodeGen/R600/lds-initializer.ll
index 7344eff2572f..bf8df63be9fd 100644
--- a/test/CodeGen/R600/lds-initializer.ll
+++ b/test/CodeGen/R600/lds-initializer.ll
@@ -6,8 +6,8 @@
 @lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8]
 
 define void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) {
- %gep = getelementptr [8 x i32] addrspace(3)* @lds, i32 0, i32 10
-  %ld = load i32 addrspace(3)* %gep
+ %gep = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds, i32 0, i32 10
+  %ld = load i32, i32 addrspace(3)* %gep
   store i32 %ld, i32 addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/R600/lds-oqap-crash.ll b/test/CodeGen/R600/lds-oqap-crash.ll
index fbcd778de2c2..6ff6fc3d7afc 100644
--- a/test/CodeGen/R600/lds-oqap-crash.ll
+++ b/test/CodeGen/R600/lds-oqap-crash.ll
@@ -12,7 +12,7 @@
 ; CHECK: {{^}}lds_crash:
 define void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) {
 entry:
-  %0 = load i32 addrspace(3)* %in
+  %0 = load i32, i32 addrspace(3)* %in
   ; This block needs to be > 115 ISA instructions to hit the bug,
   ; so we'll use udiv instructions.
   %div0 = udiv i32 %0, %b
diff --git a/test/CodeGen/R600/lds-output-queue.ll b/test/CodeGen/R600/lds-output-queue.ll
index cda75b0e0ccc..44ffc36af149 100644
--- a/test/CodeGen/R600/lds-output-queue.ll
+++ b/test/CodeGen/R600/lds-output-queue.ll
@@ -12,12 +12,12 @@
 
 define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
 entry:
-  %0 = getelementptr inbounds [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
-  %1 = load i32 addrspace(3)* %0
+  %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
+  %1 = load i32, i32 addrspace(3)* %0
   call void @llvm.AMDGPU.barrier.local()
 
   ; This will start a new clause for the vertex fetch
-  %2 = load i32 addrspace(1)* %in
+  %2 = load i32, i32 addrspace(1)* %in
   %3 = add i32 %1, %2
   store i32 %3, i32 addrspace(1)* %out
   ret void
@@ -40,9 +40,9 @@ declare void @llvm.AMDGPU.barrier.local()
 ; load from global memory which immediately follows a load from a global value that
 ; has been declared in the local memory space:
 ;
-;  %0 = getelementptr inbounds [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
-;  %1 = load i32 addrspace(3)* %0
-;  %2 = load i32 addrspace(1)* %in
+;  %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
+;  %1 = load i32, i32 addrspace(3)* %0
+;  %2 = load i32, i32 addrspace(1)* %in
 ;
 ; The instruction selection phase will generate ISA that looks like this:
 ; %OQAP = LDS_READ_RET
@@ -90,9 +90,9 @@ declare void @llvm.AMDGPU.barrier.local()
 ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
 define void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
-  %0 = getelementptr inbounds [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0
-  %1 = load i32 addrspace(3)* %0
-  %2 = load i32 addrspace(1)* %in
+  %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0
+  %1 = load i32, i32 addrspace(3)* %0
+  %2 = load i32, i32 addrspace(1)* %in
   %3 = add i32 %2, %1
   store i32 %3, i32 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/lds-size.ll b/test/CodeGen/R600/lds-size.ll
index 5287723ce191..3e8328659fdb 100644
--- a/test/CodeGen/R600/lds-size.ll
+++ b/test/CodeGen/R600/lds-size.ll
@@ -3,9 +3,9 @@
 ; This test makes sure we do not double count global values when they are
 ; used in different basic blocks.
 
-; CHECK-LABEL: {{^}}test:
 ; CHECK: .long   166120
 ; CHECK-NEXT: .long   1
+; CHECK-LABEL: {{^}}test:
 @lds = internal unnamed_addr addrspace(3) global i32 undef, align 4
 
 define void @test(i32 addrspace(1)* %out, i32 %cond) {
diff --git a/test/CodeGen/R600/lds-zero-initializer.ll b/test/CodeGen/R600/lds-zero-initializer.ll
index 1fb6f52f29b9..fb51bc0e50c2 100644
--- a/test/CodeGen/R600/lds-zero-initializer.ll
+++ b/test/CodeGen/R600/lds-zero-initializer.ll
@@ -6,8 +6,8 @@
 @lds = addrspace(3) global [256 x i32] zeroinitializer
 
 define void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) {
- %gep = getelementptr [256 x i32] addrspace(3)* @lds, i32 0, i32 10
-  %ld = load i32 addrspace(3)* %gep
+ %gep = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds, i32 0, i32 10
+  %ld = load i32, i32 addrspace(3)* %gep
   store i32 %ld, i32 addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll b/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll
index b9fa8e938ae4..4244c48d240e 100644
--- a/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll
+++ b/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll
@@ -16,7 +16,7 @@ entry:
   %0 = icmp eq i32 %in, 5
   br i1 %0, label %IF, label %ENDIF
 IF:
-  %1 = getelementptr i32 addrspace(1)* %out, i32 1
+  %1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   store i32 0, i32 addrspace(1)* %1
   br label %ENDIF
 
diff --git a/test/CodeGen/R600/llvm.AMDGPU.abs.ll b/test/CodeGen/R600/llvm.AMDGPU.abs.ll
index 8bc2583899bd..8bf094b8bc7b 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.abs.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.abs.ll
@@ -28,7 +28,7 @@ define void @s_abs_i32(i32 addrspace(1)* %out, i32 %src) nounwind {
 ; EG: SUB_INT
 ; EG: MAX_INT
 define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
-  %val = load i32 addrspace(1)* %src, align 4
+  %val = load i32, i32 addrspace(1)* %src, align 4
   %abs = call i32 @llvm.AMDGPU.abs(i32 %val) nounwind readnone
   store i32 %abs, i32 addrspace(1)* %out, align 4
   ret void
@@ -42,7 +42,7 @@ define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind
 ; EG: SUB_INT
 ; EG: MAX_INT
 define void @abs_i32_legacy_amdil(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
-  %val = load i32 addrspace(1)* %src, align 4
+  %val = load i32, i32 addrspace(1)* %src, align 4
   %abs = call i32 @llvm.AMDIL.abs.i32(i32 %val) nounwind readnone
   store i32 %abs, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll b/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll
index a11d9ae7af08..db883972d646 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll
@@ -10,14 +10,14 @@
 define void @test_barrier_global(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x()
-  %1 = getelementptr i32 addrspace(1)* %out, i32 %0
+  %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0
   store i32 %0, i32 addrspace(1)* %1
   call void @llvm.AMDGPU.barrier.global()
   %2 = call i32 @llvm.r600.read.local.size.x()
   %3 = sub i32 %2, 1
   %4 = sub i32 %3, %0
-  %5 = getelementptr i32 addrspace(1)* %out, i32 %4
-  %6 = load i32 addrspace(1)* %5
+  %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4
+  %6 = load i32, i32 addrspace(1)* %5
   store i32 %6, i32 addrspace(1)* %1
   ret void
 }
diff --git a/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll b/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll
index 76c2453d089f..48fb2e0b1a8d 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll
@@ -11,14 +11,14 @@
 define void @test_barrier_local(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x()
-  %1 = getelementptr i32 addrspace(1)* %out, i32 %0
+  %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0
   store i32 %0, i32 addrspace(1)* %1
   call void @llvm.AMDGPU.barrier.local()
   %2 = call i32 @llvm.r600.read.local.size.x()
   %3 = sub i32 %2, 1
   %4 = sub i32 %3, %0
-  %5 = getelementptr i32 addrspace(1)* %out, i32 %4
-  %6 = load i32 addrspace(1)* %5
+  %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4
+  %6 = load i32, i32 addrspace(1)* %5
   store i32 %6, i32 addrspace(1)* %1
   ret void
 }
diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll b/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll
index 2ec2546be39b..1168713ca66e 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll
@@ -44,7 +44,7 @@ define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) n
 ; FUNC-LABEL: {{^}}v_bfe_print_arg:
 ; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8
 define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) nounwind {
-  %load = load i32 addrspace(1)* %src0, align 4
+  %load = load i32, i32 addrspace(1)* %src0, align 4
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 2, i32 8) nounwind readnone
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
@@ -75,7 +75,7 @@ define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i
 ; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
 ; SI: s_endpgm
 define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 1, i32 31)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -89,20 +89,19 @@ define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 0, i32 31)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FIXME: The shifts should be 1 BFE
 ; FUNC-LABEL: {{^}}bfe_i32_test_8:
 ; SI: buffer_load_dword
 ; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
 ; SI: s_endpgm
 define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -115,7 +114,7 @@ define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 31, i32 1)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
   ret void
@@ -127,7 +126,7 @@ define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 1, i32 31)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
   ret void
@@ -139,7 +138,7 @@ define void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 8, i32 24)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
   ret void
@@ -151,7 +150,7 @@ define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 24, i32 8)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
   ret void
@@ -162,7 +161,7 @@ define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = ashr i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1)
   store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
@@ -173,7 +172,7 @@ define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = lshr i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1)
   store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
@@ -407,18 +406,14 @@ define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; XXX - This should really be a single BFE, but the sext_inreg of the
-; extended type i24 is never custom lowered.
 ; FUNC-LABEL: {{^}}bfe_sext_in_reg_i24:
 ; SI: buffer_load_dword [[LOAD:v[0-9]+]],
-; SI: v_lshlrev_b32_e32 {{v[0-9]+}}, 8, {{v[0-9]+}}
-; SI: v_ashrrev_i32_e32 {{v[0-9]+}}, 8, {{v[0-9]+}}
-; XSI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 8
-; XSI-NOT: SHL
-; XSI-NOT: SHR
-; XSI: buffer_store_dword [[BFE]],
+; SI-NOT: v_lshl
+; SI-NOT: v_ashr
+; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24
+; SI: buffer_store_dword [[BFE]],
 define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 0, i32 24)
   %shl = shl i32 %bfe, 8
   %ashr = ashr i32 %shl, 8
@@ -434,7 +429,7 @@ define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 ; SI: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]]
 ; SI: buffer_store_dword [[TMP2]]
 define void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %src = load i32 addrspace(1)* %in, align 4
+  %src = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %src, i32 1, i32 16) nounwind readnone
   %div = sdiv i32 %bfe, 2
   store i32 %div, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll b/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll
index 6cd0108def2d..541119242a94 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll
@@ -65,7 +65,7 @@ define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
-  %load = load i8 addrspace(1)* %in
+  %load = load i8, i8 addrspace(1)* %in
   %ext = zext i8 %load to i32
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -79,7 +79,7 @@ define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) n
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %load = load i32 addrspace(1)* %in, align 4
+  %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
   %ext = and i32 %add, 255
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8)
@@ -94,7 +94,7 @@ define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %i
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %load = load i32 addrspace(1)* %in, align 4
+  %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
   %ext = and i32 %add, 65535
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 16)
@@ -108,7 +108,7 @@ define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %
 ; SI: bfe
 ; SI: s_endpgm
 define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %load = load i32 addrspace(1)* %in, align 4
+  %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
   %ext = and i32 %add, 255
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 1, i32 8)
@@ -123,7 +123,7 @@ define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspa
 ; SI-NEXT: bfe
 ; SI: s_endpgm
 define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %load = load i32 addrspace(1)* %in, align 4
+  %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
   %ext = and i32 %add, 255
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 3, i32 8)
@@ -138,7 +138,7 @@ define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspa
 ; SI-NEXT: bfe
 ; SI: s_endpgm
 define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %load = load i32 addrspace(1)* %in, align 4
+  %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
   %ext = and i32 %add, 255
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 7, i32 8)
@@ -152,7 +152,7 @@ define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspa
 ; SI-NEXT: bfe
 ; SI: s_endpgm
 define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %load = load i32 addrspace(1)* %in, align 4
+  %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
   %ext = and i32 %add, 65535
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 8, i32 8)
@@ -166,14 +166,14 @@ define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrsp
 ; SI: s_endpgm
 ; EG: AND_INT T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, 1,
 define void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 0, i32 1)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
   ret void
 }
 
 define void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 8)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -181,7 +181,7 @@ define void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
 }
 
 define void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 1)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -196,7 +196,7 @@ define void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %shr = lshr i32 %shl, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 31, i32 1)
@@ -211,7 +211,7 @@ define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
 ; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1
 ; SI: s_endpgm
 define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %shr = ashr i32 %shl, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 0, i32 1)
@@ -224,7 +224,7 @@ define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
 ; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
 ; SI: s_endpgm
 define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 1, i32 31)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -236,7 +236,7 @@ define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 31)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -249,7 +249,7 @@ define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
@@ -262,7 +262,7 @@ define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 31, i32 1)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
   ret void
@@ -274,7 +274,7 @@ define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 1, i32 31)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
   ret void
@@ -286,7 +286,7 @@ define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 8, i32 24)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
   ret void
@@ -298,7 +298,7 @@ define void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 24, i32 8)
   store i32 %bfe, i32 addrspace(1)* %out, align 4
   ret void
@@ -309,7 +309,7 @@ define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = ashr i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1)
   store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
@@ -320,7 +320,7 @@ define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = lshr i32 %x, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1)
   store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
@@ -439,7 +439,7 @@ define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
 ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
-; EG-NOT: BFEfppppppppppppp
+; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65536, i32 16, i32 8) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
@@ -568,10 +568,60 @@ define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
 define void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
                                             i32 addrspace(1)* %out1,
                                             i32 addrspace(1)* %in) nounwind {
-  %src = load i32 addrspace(1)* %in, align 4
+  %src = load i32, i32 addrspace(1)* %in, align 4
   %and = and i32 %src, 63
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %and, i32 2, i32 2) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out0, align 4
   store i32 %and, i32 addrspace(1)* %out1, align 4
   ret void
 }
+
+; FUNC-LABEL: {{^}}lshr_and:
+; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
+; SI: buffer_store_dword
+define void @lshr_and(i32 addrspace(1)* %out, i32 %a) nounwind {
+  %b = lshr i32 %a, 6
+  %c = and i32 %b, 7
+  store i32 %c, i32 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_lshr_and:
+; SI: v_bfe_u32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, 3
+; SI: buffer_store_dword
+define void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %c = lshr i32 %a, %b
+  %d = and i32 %c, 7
+  store i32 %d, i32 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}and_lshr:
+; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
+; SI: buffer_store_dword
+define void @and_lshr(i32 addrspace(1)* %out, i32 %a) nounwind {
+  %b = and i32 %a, 448
+  %c = lshr i32 %b, 6
+  store i32 %c, i32 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}and_lshr2:
+; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
+; SI: buffer_store_dword
+define void @and_lshr2(i32 addrspace(1)* %out, i32 %a) nounwind {
+  %b = and i32 %a, 511
+  %c = lshr i32 %b, 6
+  store i32 %c, i32 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}shl_lshr:
+; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x150002
+; SI: buffer_store_dword
+define void @shl_lshr(i32 addrspace(1)* %out, i32 %a) nounwind {
+  %b = shl i32 %a, 9
+  %c = lshr i32 %b, 11
+  store i32 %c, i32 addrspace(1)* %out, align 8
+  ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfm.ll b/test/CodeGen/R600/llvm.AMDGPU.bfm.ll
index 2346f408ec44..50492289d744 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.bfm.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfm.ll
@@ -5,7 +5,7 @@
 declare i32 @llvm.AMDGPU.bfm(i32, i32) nounwind readnone
 
 ; FUNC-LABEL: {{^}}bfm_arg_arg:
-; SI: v_bfm
+; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 ; EG: BFM_INT
 define void @bfm_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 %src1) nounwind readnone
@@ -14,7 +14,7 @@ define void @bfm_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind
 }
 
 ; FUNC-LABEL: {{^}}bfm_arg_imm:
-; SI: v_bfm
+; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x7b
 ; EG: BFM_INT
 define void @bfm_arg_imm(i32 addrspace(1)* %out, i32 %src0) nounwind {
   %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 123) nounwind readnone
@@ -23,7 +23,7 @@ define void @bfm_arg_imm(i32 addrspace(1)* %out, i32 %src0) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}bfm_imm_arg:
-; SI: v_bfm
+; SI: s_bfm_b32 {{s[0-9]+}}, 0x7b, {{s[0-9]+}}
 ; EG: BFM_INT
 define void @bfm_imm_arg(i32 addrspace(1)* %out, i32 %src1) nounwind {
   %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 %src1) nounwind readnone
@@ -32,10 +32,29 @@ define void @bfm_imm_arg(i32 addrspace(1)* %out, i32 %src1) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}bfm_imm_imm:
-; SI: v_bfm
+; SI: s_bfm_b32 {{s[0-9]+}}, 0x7b, 0x1c8
 ; EG: BFM_INT
 define void @bfm_imm_imm(i32 addrspace(1)* %out) nounwind {
   %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 456) nounwind readnone
   store i32 %bfm, i32 addrspace(1)* %out, align 4
   ret void
 }
+
+; FUNC-LABEL: {{^}}bfm_pattern:
+; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+  %a = shl i32 1, %x
+  %b = sub i32 %a, 1
+  %c = shl i32 %b, %y
+  store i32 %c, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bfm_pattern_simple:
+; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0
+define void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) {
+  %a = shl i32 1, %x
+  %b = sub i32 %a, 1
+  store i32 %b, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.brev.ll b/test/CodeGen/R600/llvm.AMDGPU.brev.ll
index 3973f539c135..301de4b1c82d 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.brev.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.brev.ll
@@ -21,7 +21,7 @@ define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
 define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
-  %val = load i32 addrspace(1)* %valptr, align 4
+  %val = load i32, i32 addrspace(1)* %valptr, align 4
   %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone
   store i32 %ctlz, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/R600/llvm.AMDGPU.class.ll b/test/CodeGen/R600/llvm.AMDGPU.class.ll
index 974e3c71e622..805a88b59c72 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.class.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.class.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare i1 @llvm.AMDGPU.class.f32(float, i32) #1
 declare i1 @llvm.AMDGPU.class.f64(double, i32) #1
@@ -134,9 +134,9 @@ define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
 ; SI: s_endpgm
 define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %a = load float addrspace(1)* %gep.in
+  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep.in
 
   %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1
   %sext = sext i1 %result to i32
@@ -152,9 +152,9 @@ define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(
 ; SI: s_endpgm
 define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %b = load i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %b = load i32, i32 addrspace(1)* %gep.in
 
   %result = call i1 @llvm.AMDGPU.class.f32(float 1.0, i32 %b) #1
   %sext = sext i1 %result to i32
@@ -172,9 +172,9 @@ define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %
 ; SI: s_endpgm
 define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %b = load i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %b = load i32, i32 addrspace(1)* %gep.in
 
   %result = call i1 @llvm.AMDGPU.class.f32(float 1024.0, i32 %b) #1
   %sext = sext i1 %result to i32
@@ -290,9 +290,9 @@ define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 {
 ; SI: s_endpgm
 define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.in = getelementptr double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %a = load double addrspace(1)* %in
+  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load double, double addrspace(1)* %in
 
   %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1
   %sext = sext i1 %result to i32
@@ -306,9 +306,9 @@ define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace
 ; SI: s_endpgm
 define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %b = load i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %b = load i32, i32 addrspace(1)* %gep.in
 
   %result = call i1 @llvm.AMDGPU.class.f64(double 1.0, i32 %b) #1
   %sext = sext i1 %result to i32
@@ -321,9 +321,9 @@ define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %
 ; SI: s_endpgm
 define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %b = load i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %b = load i32, i32 addrspace(1)* %gep.in
 
   %result = call i1 @llvm.AMDGPU.class.f64(double 1024.0, i32 %b) #1
   %sext = sext i1 %result to i32
@@ -338,9 +338,9 @@ define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i3
 ; SI: s_endpgm
 define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %a = load float addrspace(1)* %gep.in
+  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep.in
 
   %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
   %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 3) #1
@@ -358,9 +358,9 @@ define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)
 ; SI: s_endpgm
 define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %a = load float addrspace(1)* %gep.in
+  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep.in
 
   %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
   %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1
@@ -381,9 +381,9 @@ define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1
 ; SI: s_endpgm
 define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %a = load float addrspace(1)* %gep.in
+  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep.in
 
   %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
   %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1
@@ -416,9 +416,9 @@ define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float ad
 ; SI: s_endpgm
 define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %a = load float addrspace(1)* %gep.in
+  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep.in
 
   %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
   %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1
@@ -436,9 +436,9 @@ define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)
 ; SI: s_endpgm
 define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %a = load float addrspace(1)* %gep.in
+  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep.in
 
   %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1
   %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1
@@ -456,9 +456,9 @@ define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)
 ; SI: s_endpgm
 define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %a = load float addrspace(1)* %gep.in
+  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep.in
 
   %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
   %class1 = call i1 @llvm.AMDGPU.class.f32(float %b, i32 8) #1
@@ -472,7 +472,7 @@ define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace
 ; SI-LABEL: {{^}}test_class_0_f32:
 ; SI-NOT: v_cmp_class
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
-; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
 define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
   %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 0) #1
diff --git a/test/CodeGen/R600/llvm.AMDGPU.cube.ll b/test/CodeGen/R600/llvm.AMDGPU.cube.ll
index aa07afdebea6..e95a51093cb7 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.cube.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.cube.ll
@@ -8,15 +8,15 @@
 ; CHECK: CUBE * T{{[0-9]}}.W
 define void @cube() #0 {
 main_body:
-  %0 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
   %1 = extractelement <4 x float> %0, i32 3
-  %2 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
   %3 = extractelement <4 x float> %2, i32 0
   %4 = fdiv float %3, %1
-  %5 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
   %6 = extractelement <4 x float> %5, i32 1
   %7 = fdiv float %6, %1
-  %8 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
   %9 = extractelement <4 x float> %8, i32 2
   %10 = fdiv float %9, %1
   %11 = insertelement <4 x float> undef, float %4, i32 0
diff --git a/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll b/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll
index 799817e01096..8b32f696449e 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll
@@ -9,7 +9,7 @@ declare float @llvm.AMDGPU.cvt.f32.ubyte3(i32) nounwind readnone
 ; SI-LABEL: {{^}}test_unpack_byte0_to_float:
 ; SI: v_cvt_f32_ubyte0
 define void @test_unpack_byte0_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %val = load i32 addrspace(1)* %in, align 4
+  %val = load i32, i32 addrspace(1)* %in, align 4
   %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte0(i32 %val) nounwind readnone
   store float %cvt, float addrspace(1)* %out, align 4
   ret void
@@ -18,7 +18,7 @@ define void @test_unpack_byte0_to_float(float addrspace(1)* %out, i32 addrspace(
 ; SI-LABEL: {{^}}test_unpack_byte1_to_float:
 ; SI: v_cvt_f32_ubyte1
 define void @test_unpack_byte1_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %val = load i32 addrspace(1)* %in, align 4
+  %val = load i32, i32 addrspace(1)* %in, align 4
   %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte1(i32 %val) nounwind readnone
   store float %cvt, float addrspace(1)* %out, align 4
   ret void
@@ -27,7 +27,7 @@ define void @test_unpack_byte1_to_float(float addrspace(1)* %out, i32 addrspace(
 ; SI-LABEL: {{^}}test_unpack_byte2_to_float:
 ; SI: v_cvt_f32_ubyte2
 define void @test_unpack_byte2_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %val = load i32 addrspace(1)* %in, align 4
+  %val = load i32, i32 addrspace(1)* %in, align 4
   %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte2(i32 %val) nounwind readnone
   store float %cvt, float addrspace(1)* %out, align 4
   ret void
@@ -36,7 +36,7 @@ define void @test_unpack_byte2_to_float(float addrspace(1)* %out, i32 addrspace(
 ; SI-LABEL: {{^}}test_unpack_byte3_to_float:
 ; SI: v_cvt_f32_ubyte3
 define void @test_unpack_byte3_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %val = load i32 addrspace(1)* %in, align 4
+  %val = load i32, i32 addrspace(1)* %in, align 4
   %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte3(i32 %val) nounwind readnone
   store float %cvt, float addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll b/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll
index 52d0519ef277..55ca9c7536e5 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll
@@ -1,25 +1,29 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
 
 declare float @llvm.AMDGPU.div.fixup.f32(float, float, float) nounwind readnone
 declare double @llvm.AMDGPU.div.fixup.f64(double, double, double) nounwind readnone
 
-; SI-LABEL: {{^}}test_div_fixup_f32:
+; GCN-LABEL: {{^}}test_div_fixup_f32:
 ; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
 ; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
-; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
-; SI: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
+; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
+; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
   %result = call float @llvm.AMDGPU.div.fixup.f32(float %a, float %b, float %c) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: {{^}}test_div_fixup_f64:
-; SI: v_div_fixup_f64
+; GCN-LABEL: {{^}}test_div_fixup_f64:
+; GCN: v_div_fixup_f64
 define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
   %result = call double @llvm.AMDGPU.div.fixup.f64(double %a, double %b, double %c) nounwind readnone
   store double %result, double addrspace(1)* %out, align 8
diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll b/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll
index bbe910a45e8f..bcb7f870f1f4 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll
@@ -9,13 +9,16 @@ declare float @llvm.AMDGPU.div.fmas.f32(float, float, float, i1) nounwind readno
 declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind readnone
 
 ; GCN-LABEL: {{^}}test_div_fmas_f32:
-; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; GCN-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
+; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
 ; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
 ; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
 ; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
-; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], [[VC]]
+; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VB]], [[VA]], [[VC]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
 define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
@@ -24,8 +27,50 @@ define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, flo
   ret void
 }
 
-; SI-LABEL: {{^}}test_div_fmas_f64:
-; SI: v_div_fmas_f64
+; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0:
+; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
+; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+  %result = call float @llvm.AMDGPU.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone
+  store float %result, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1:
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
+; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
+; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VA]], [[VC]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+  %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone
+  store float %result, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2:
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
+; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+  %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone
+  store float %result, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_div_fmas_f64:
+; GCN: v_div_fmas_f64
 define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind {
   %result = call double @llvm.AMDGPU.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
   store double %result, double addrspace(1)* %out, align 8
@@ -33,7 +78,7 @@ define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b,
 }
 
 ; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc:
-; SI: v_cmp_eq_i32_e64 vcc, s{{[0-9]+}}, 0
+; SI: v_cmp_eq_i32_e64 vcc, 0, s{{[0-9]+}}
 ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind {
   %cmp = icmp eq i32 %i, 0
@@ -61,25 +106,25 @@ define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, fl
 }
 
 ; GCN-LABEL: {{^}}test_div_fmas_f32_logical_cond_to_vcc:
-; SI-DAG: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0
-; SI-DAG: v_cmp_ne_i32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0
-; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]]
 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 
-; SI: v_div_fmas_f32 {{v[0-9]+}}, [[B]], [[A]], [[C]]
+; SI-DAG: v_cmp_eq_i32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}}
+; SI-DAG: v_cmp_ne_i32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}}
+; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]]
+; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]]
 ; SI: s_endpgm
 define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.a = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.b = getelementptr float addrspace(1)* %gep.a, i32 1
-  %gep.c = getelementptr float addrspace(1)* %gep.a, i32 2
-  %gep.out = getelementptr float addrspace(1)* %out, i32 2
+  %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
+  %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
 
-  %a = load float addrspace(1)* %gep.a
-  %b = load float addrspace(1)* %gep.b
-  %c = load float addrspace(1)* %gep.c
+  %a = load float, float addrspace(1)* %gep.a
+  %b = load float, float addrspace(1)* %gep.b
+  %c = load float, float addrspace(1)* %gep.c
 
   %cmp0 = icmp eq i32 %tid, 0
   %cmp1 = icmp ne i32 %d, 0
@@ -91,17 +136,17 @@ define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, flo
 }
 
 ; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
-; SI: v_cmp_eq_i32_e64 [[CMPTID:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0
-; SI: s_and_saveexec_b64 [[CMPTID]], [[CMPTID]]
-; SI: s_xor_b64 [[CMPTID]], exec, [[CMPTID]]
+; SI: v_cmp_eq_i32_e32 vcc, 0, v{{[0-9]+}}
+; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
+; SI: s_xor_b64 [[SAVE]], exec, [[SAVE]]
 
 ; SI: buffer_load_dword [[LOAD:v[0-9]+]]
-; SI: v_cmp_ne_i32_e64 [[CMPLOAD:s\[[0-9]+:[0-9]+\]]], [[LOAD]], 0
-; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, [[CMPLOAD]]
+; SI: v_cmp_ne_i32_e32 vcc, 0, [[LOAD]]
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
 
 
-; SI: BB6_2:
-; SI: s_or_b64 exec, exec, [[CMPTID]]
+; SI: BB9_2:
+; SI: s_or_b64 exec, exec, [[SAVE]]
 ; SI: v_cmp_ne_i32_e32 vcc, 0, v0
 ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: buffer_store_dword
@@ -109,20 +154,20 @@ define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, flo
 define void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {
 entry:
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.out = getelementptr float addrspace(1)* %out, i32 2
-  %gep.a = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.b = getelementptr float addrspace(1)* %gep.a, i32 1
-  %gep.c = getelementptr float addrspace(1)* %gep.a, i32 2
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
+  %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
+  %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
 
-  %a = load float addrspace(1)* %gep.a
-  %b = load float addrspace(1)* %gep.b
-  %c = load float addrspace(1)* %gep.c
+  %a = load float, float addrspace(1)* %gep.a
+  %b = load float, float addrspace(1)* %gep.b
+  %c = load float, float addrspace(1)* %gep.c
 
   %cmp0 = icmp eq i32 %tid, 0
   br i1 %cmp0, label %bb, label %exit
 
 bb:
-  %val = load i32 addrspace(1)* %dummy
+  %val = load i32, i32 addrspace(1)* %dummy
   %cmp1 = icmp ne i32 %val, 0
   br label %exit
 
diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll b/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
index 5773da0bb2e4..de830de039c7 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
@@ -13,11 +13,11 @@ declare float @llvm.fabs.f32(float) nounwind readnone
 ; SI: s_endpgm
 define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
 
-  %a = load float addrspace(1)* %gep.0, align 4
-  %b = load float addrspace(1)* %gep.1, align 4
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
 
   %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
   %result0 = extractvalue { float, i1 } %result, 0
@@ -33,11 +33,11 @@ define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)*
 ; SI: s_endpgm
 define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
 
-  %a = load float addrspace(1)* %gep.0, align 4
-  %b = load float addrspace(1)* %gep.1, align 4
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
 
   %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
   %result0 = extractvalue { float, i1 } %result, 0
@@ -53,11 +53,11 @@ define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)*
 ; SI: s_endpgm
 define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
 
-  %a = load double addrspace(1)* %gep.0, align 8
-  %b = load double addrspace(1)* %gep.1, align 8
+  %a = load double, double addrspace(1)* %gep.0, align 8
+  %b = load double, double addrspace(1)* %gep.1, align 8
 
   %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
   %result0 = extractvalue { double, i1 } %result, 0
@@ -73,11 +73,11 @@ define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)
 ; SI: s_endpgm
 define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
 
-  %a = load double addrspace(1)* %gep.0, align 8
-  %b = load double addrspace(1)* %gep.1, align 8
+  %a = load double, double addrspace(1)* %gep.0, align 8
+  %b = load double, double addrspace(1)* %gep.1, align 8
 
   %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
   %result0 = extractvalue { double, i1 } %result, 0
@@ -93,9 +93,9 @@ define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)
 ; SI: s_endpgm
 define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
 
-  %b = load float addrspace(1)* %gep, align 4
+  %b = load float, float addrspace(1)* %gep, align 4
 
   %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
   %result0 = extractvalue { float, i1 } %result, 0
@@ -111,9 +111,9 @@ define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float add
 ; SI: s_endpgm
 define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
 
-  %b = load float addrspace(1)* %gep, align 4
+  %b = load float, float addrspace(1)* %gep, align 4
 
   %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
   %result0 = extractvalue { float, i1 } %result, 0
@@ -129,9 +129,9 @@ define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float add
 ; SI: s_endpgm
 define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
 
-  %a = load float addrspace(1)* %gep, align 4
+  %a = load float, float addrspace(1)* %gep, align 4
 
   %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
   %result0 = extractvalue { float, i1 } %result, 0
@@ -147,9 +147,9 @@ define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float add
 ; SI: s_endpgm
 define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
 
-  %a = load float addrspace(1)* %gep, align 4
+  %a = load float, float addrspace(1)* %gep, align 4
 
   %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
   %result0 = extractvalue { float, i1 } %result, 0
@@ -165,9 +165,9 @@ define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float add
 ; SI: s_endpgm
 define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
 
-  %b = load double addrspace(1)* %gep, align 8
+  %b = load double, double addrspace(1)* %gep, align 8
 
   %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
   %result0 = extractvalue { double, i1 } %result, 0
@@ -183,9 +183,9 @@ define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double a
 ; SI: s_endpgm
 define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
 
-  %b = load double addrspace(1)* %gep, align 8
+  %b = load double, double addrspace(1)* %gep, align 8
 
   %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
   %result0 = extractvalue { double, i1 } %result, 0
@@ -201,9 +201,9 @@ define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double a
 ; SI: s_endpgm
 define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
 
-  %a = load double addrspace(1)* %gep, align 8
+  %a = load double, double addrspace(1)* %gep, align 8
 
   %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
   %result0 = extractvalue { double, i1 } %result, 0
@@ -219,9 +219,9 @@ define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double a
 ; SI: s_endpgm
 define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
 
-  %a = load double addrspace(1)* %gep, align 8
+  %a = load double, double addrspace(1)* %gep, align 8
 
   %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
   %result0 = extractvalue { double, i1 } %result, 0
@@ -294,8 +294,8 @@ define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %
 ; SI: s_endpgm
 define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %a = load float addrspace(1)* %gep.0, align 4
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %a = load float, float addrspace(1)* %gep.0, align 4
 
   %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float 1.0, float %a, i1 false) nounwind readnone
   %result0 = extractvalue { float, i1 } %result, 0
@@ -310,8 +310,8 @@ define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float a
 ; SI: s_endpgm
 define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %a = load float addrspace(1)* %gep.0, align 4
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %a = load float, float addrspace(1)* %gep.0, align 4
 
   %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float 2.0, i1 false) nounwind readnone
   %result0 = extractvalue { float, i1 } %result, 0
@@ -327,11 +327,11 @@ define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float a
 ; SI: s_endpgm
 define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
 
-  %a = load float addrspace(1)* %gep.0, align 4
-  %b = load float addrspace(1)* %gep.1, align 4
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
 
   %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone
 
@@ -349,11 +349,11 @@ define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspa
 ; SI: s_endpgm
 define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
 
-  %a = load float addrspace(1)* %gep.0, align 4
-  %b = load float addrspace(1)* %gep.1, align 4
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
 
   %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone
 
diff --git a/test/CodeGen/R600/llvm.AMDGPU.flbit.i32.ll b/test/CodeGen/R600/llvm.AMDGPU.flbit.i32.ll
index 19fbee8913b4..20c7af8ade5e 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.flbit.i32.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.flbit.i32.ll
@@ -21,7 +21,7 @@ define void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
 define void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
-  %val = load i32 addrspace(1)* %valptr, align 4
+  %val = load i32, i32 addrspace(1)* %valptr, align 4
   %r = call i32 @llvm.AMDGPU.flbit.i32(i32 %val) nounwind readnone
   store i32 %r, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/R600/llvm.AMDGPU.fract.f64.ll b/test/CodeGen/R600/llvm.AMDGPU.fract.f64.ll
new file mode 100644
index 000000000000..e098dd35d6da
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.fract.f64.ll
@@ -0,0 +1,60 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
+
+declare double @llvm.fabs.f64(double %Val)
+declare double @llvm.AMDGPU.fract.f64(double) nounwind readnone
+
+; FUNC-LABEL: {{^}}fract_f64:
+; GCN: v_fract_f64_e32 [[FRC:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]
+; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
+; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
+; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
+; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
+; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]]
+; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]]
+; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]]
+; CI: buffer_store_dwordx2 [[FRC]]
+define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) nounwind {
+  %val = load double, double addrspace(1)* %src, align 4
+  %fract = call double @llvm.AMDGPU.fract.f64(double %val) nounwind readnone
+  store double %fract, double addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fract_f64_neg:
+; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]
+; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
+; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
+; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
+; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
+; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]]
+; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]]
+; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]]
+; CI: buffer_store_dwordx2 [[FRC]]
+define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) nounwind {
+  %val = load double, double addrspace(1)* %src, align 4
+  %neg = fsub double 0.0, %val
+  %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone
+  store double %fract, double addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fract_f64_neg_abs:
+; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]|
+; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
+; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
+; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
+; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
+; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]]
+; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]]
+; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]]
+; CI: buffer_store_dwordx2 [[FRC]]
+define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) nounwind {
+  %val = load double, double addrspace(1)* %src, align 4
+  %abs = call double @llvm.fabs.f64(double %val)
+  %neg = fsub double 0.0, %abs
+  %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone
+  store double %fract, double addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.fract.ll b/test/CodeGen/R600/llvm.AMDGPU.fract.ll
index ef89742441c6..7501b4b75465 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.fract.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.fract.ll
@@ -16,7 +16,7 @@ declare float @llvm.AMDIL.fraction.f32(float) nounwind readnone
 ; GCN: buffer_store_dword [[RESULT]]
 ; EG: FRACT
 define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
-  %val = load float addrspace(1)* %src, align 4
+  %val = load float, float addrspace(1)* %src, align 4
   %fract = call float @llvm.AMDGPU.fract.f32(float %val) nounwind readnone
   store float %fract, float addrspace(1)* %out, align 4
   ret void
@@ -29,7 +29,7 @@ define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) nounw
 ; GCN: buffer_store_dword [[RESULT]]
 ; EG: FRACT
 define void @fract_f32_legacy_amdil(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
-  %val = load float addrspace(1)* %src, align 4
+  %val = load float, float addrspace(1)* %src, align 4
   %fract = call float @llvm.AMDIL.fraction.f32(float %val) nounwind readnone
   store float %fract, float addrspace(1)* %out, align 4
   ret void
@@ -42,7 +42,7 @@ define void @fract_f32_legacy_amdil(float addrspace(1)* %out, float addrspace(1)
 ; GCN: buffer_store_dword [[RESULT]]
 ; EG: FRACT
 define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
-  %val = load float addrspace(1)* %src, align 4
+  %val = load float, float addrspace(1)* %src, align 4
   %neg = fsub float 0.0, %val
   %fract = call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone
   store float %fract, float addrspace(1)* %out, align 4
@@ -56,7 +56,7 @@ define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) n
 ; GCN: buffer_store_dword [[RESULT]]
 ; EG: FRACT
 define void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
-  %val = load float addrspace(1)* %src, align 4
+  %val = load float, float addrspace(1)* %src, align 4
   %abs = call float @llvm.fabs.f32(float %val)
   %neg = fsub float 0.0, %abs
   %fract = call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.imax.ll b/test/CodeGen/R600/llvm.AMDGPU.imax.ll
index ce7fca056a02..46662f96c290 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.imax.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.imax.ll
@@ -5,7 +5,7 @@
 ; SI: v_max_i32_e32
 define void @vector_imax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
 main_body:
-  %load = load i32 addrspace(1)* %in, align 4
+  %load = load i32, i32 addrspace(1)* %in, align 4
   %max = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %load)
   %bc = bitcast i32 %max to float
   call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
diff --git a/test/CodeGen/R600/llvm.AMDGPU.imin.ll b/test/CodeGen/R600/llvm.AMDGPU.imin.ll
index 15cd38b19d7e..34b454e23755 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.imin.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.imin.ll
@@ -5,7 +5,7 @@
 ; SI: v_min_i32_e32
 define void @vector_imin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
 main_body:
-  %load = load i32 addrspace(1)* %in, align 4
+  %load = load i32, i32 addrspace(1)* %in, align 4
   %min = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %load)
   %bc = bitcast i32 %min to float
   call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
diff --git a/test/CodeGen/R600/llvm.AMDGPU.kill.ll b/test/CodeGen/R600/llvm.AMDGPU.kill.ll
index 30b0fc2bd73b..057708e7b5cc 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.kill.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.kill.ll
@@ -18,8 +18,8 @@ main_body:
 
 ; SI-LABEL: {{^}}kill_vcc_implicit_def:
 ; SI-NOT: v_cmp_gt_f32_e32 vcc,
+; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}}
 ; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}}
-; SI: v_cmp_lt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0
 ; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
 define void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #1 {
 entry:
diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll
index 4318aeaac786..67f1d22c7178 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll
@@ -1,9 +1,21 @@
 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
 
 declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone
 
 ; FUNC-LABEL: {{^}}rsq_clamped_f64:
 ; SI: v_rsq_clamp_f64_e32
+
+; VI: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[2:3]
+; TODO: this constant should be folded:
+; VI: s_mov_b32 s[[ALLBITS:[0-9+]]], -1
+; VI: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
+; VI: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]]
+; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
+; VI: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
+; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]]
+; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
+
 define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind {
   %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone
   store double %rsq_clamped, double addrspace(1)* %out, align 8
diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll
index 9336baffc97f..eeff2536b232 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 
@@ -6,7 +7,15 @@ declare float @llvm.AMDGPU.rsq.clamped.f32(float) nounwind readnone
 
 ; FUNC-LABEL: {{^}}rsq_clamped_f32:
 ; SI: v_rsq_clamp_f32_e32
+
+; VI: v_rsq_f32_e32 [[RSQ:v[0-9]+]], {{s[0-9]+}}
+; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]]
+; TODO: this constant should be folded:
+; VI: v_mov_b32_e32 [[MINFLT:v[0-9]+]], 0xff7fffff
+; VI: v_max_f32_e32 {{v[0-9]+}}, [[MIN]], [[MINFLT]]
+
 ; EG: RECIPSQRT_CLAMPED
+
 define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind {
   %rsq_clamped = call float @llvm.AMDGPU.rsq.clamped.f32(float %src) nounwind readnone
   store float %rsq_clamped, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/R600/llvm.AMDGPU.tex.ll b/test/CodeGen/R600/llvm.AMDGPU.tex.ll
index aac014bde456..10206609bb57 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.tex.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.tex.ll
@@ -18,7 +18,7 @@
 ;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
 
 define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
-   %addr = load <4 x float> addrspace(1)* %in
+   %addr = load <4 x float>, <4 x float> addrspace(1)* %in
    %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %addr, i32 0, i32 0, i32 1)
    %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res1, i32 0, i32 0, i32 2)
    %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res2, i32 0, i32 0, i32 3)
diff --git a/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll b/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll
index 5829f7348df9..6b546a7e17c1 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll
@@ -10,8 +10,8 @@ declare double @llvm.AMDGPU.trig.preop.f64(double, i32) nounwind readnone
 ; SI: buffer_store_dwordx2 [[RESULT]],
 ; SI: s_endpgm
 define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
-  %a = load double addrspace(1)* %aptr, align 8
-  %b = load i32 addrspace(1)* %bptr, align 4
+  %a = load double, double addrspace(1)* %aptr, align 8
+  %b = load i32, i32 addrspace(1)* %bptr, align 4
   %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 %b) nounwind readnone
   store double %result, double addrspace(1)* %out, align 8
   ret void
@@ -23,7 +23,7 @@ define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)*
 ; SI: buffer_store_dwordx2 [[RESULT]],
 ; SI: s_endpgm
 define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
-  %a = load double addrspace(1)* %aptr, align 8
+  %a = load double, double addrspace(1)* %aptr, align 8
   %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 7) nounwind readnone
   store double %result, double addrspace(1)* %out, align 8
   ret void
diff --git a/test/CodeGen/R600/llvm.AMDGPU.umad24.ll b/test/CodeGen/R600/llvm.AMDGPU.umad24.ll
index 88613db2161f..77a073b0cb03 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.umad24.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umad24.ll
@@ -25,12 +25,12 @@ define void @test_umad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2
 ; SI: buffer_store_dword [[RESULT]]
 define void @commute_umad24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %out.gep = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %src0.gep = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %src2.gep = getelementptr i32 addrspace(1)* %src0.gep, i32 1
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %src0.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %src2.gep = getelementptr i32, i32 addrspace(1)* %src0.gep, i32 1
 
-  %src0 = load i32 addrspace(1)* %src0.gep, align 4
-  %src2 = load i32 addrspace(1)* %src2.gep, align 4
+  %src0 = load i32, i32 addrspace(1)* %src0.gep, align 4
+  %src2 = load i32, i32 addrspace(1)* %src2.gep, align 4
   %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 4, i32 %src2) nounwind readnone
   store i32 %mad, i32 addrspace(1)* %out.gep, align 4
   ret void
diff --git a/test/CodeGen/R600/llvm.AMDGPU.umax.ll b/test/CodeGen/R600/llvm.AMDGPU.umax.ll
index 4320dfe669d8..a97d103016d3 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.umax.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umax.ll
@@ -5,7 +5,7 @@
 ; SI: v_max_u32_e32
 define void @vector_umax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
 main_body:
-  %load = load i32 addrspace(1)* %in, align 4
+  %load = load i32, i32 addrspace(1)* %in, align 4
   %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %load)
   %bc = bitcast i32 %max to float
   call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
@@ -28,7 +28,7 @@ entry:
 ; SI-NOT: and
 ; SI: buffer_store_short [[RESULT]],
 define void @trunc_zext_umax(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
-  %tmp5 = load i8 addrspace(1)* %src, align 1
+  %tmp5 = load i8, i8 addrspace(1)* %src, align 1
   %tmp2 = zext i8 %tmp5 to i32
   %tmp3 = tail call i32 @llvm.AMDGPU.umax(i32 %tmp2, i32 0) nounwind readnone
   %tmp4 = trunc i32 %tmp3 to i8
diff --git a/test/CodeGen/R600/llvm.AMDGPU.umin.ll b/test/CodeGen/R600/llvm.AMDGPU.umin.ll
index e4cac33a07a7..2acd10e0c631 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.umin.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umin.ll
@@ -5,7 +5,7 @@
 ; SI: v_min_u32_e32
 define void @vector_umin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
 main_body:
-  %load = load i32 addrspace(1)* %in, align 4
+  %load = load i32, i32 addrspace(1)* %in, align 4
   %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %load)
   %bc = bitcast i32 %min to float
   call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
@@ -28,7 +28,7 @@ entry:
 ; SI-NOT: and
 ; SI: buffer_store_short [[RESULT]],
 define void @trunc_zext_umin(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
-  %tmp5 = load i8 addrspace(1)* %src, align 1
+  %tmp5 = load i8, i8 addrspace(1)* %src, align 1
   %tmp2 = zext i8 %tmp5 to i32
   %tmp3 = tail call i32 @llvm.AMDGPU.umin(i32 %tmp2, i32 0) nounwind readnone
   %tmp4 = trunc i32 %tmp3 to i8
diff --git a/test/CodeGen/R600/llvm.SI.fs.interp.ll b/test/CodeGen/R600/llvm.SI.fs.interp.ll
index 9f87a41de247..3d05da616e4e 100644
--- a/test/CodeGen/R600/llvm.SI.fs.interp.ll
+++ b/test/CodeGen/R600/llvm.SI.fs.interp.ll
@@ -1,11 +1,13 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s
+;RUN: llc < %s -march=amdgcn -mcpu=kabini -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s
 
-;CHECK-NOT: s_wqm
-;CHECK: s_mov_b32
-;CHECK: v_interp_p1_f32
-;CHECK: v_interp_p2_f32
-;CHECK: v_interp_mov_f32
+;GCN-LABEL: {{^}}main:
+;GCN-NOT: s_wqm
+;GCN: s_mov_b32
+;GCN-NEXT: v_interp_mov_f32
+;GCN: v_interp_p1_f32
+;GCN: v_interp_p2_f32
 
 define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 {
 main_body:
@@ -16,7 +18,33 @@ main_body:
   ret void
 }
 
-declare void @llvm.AMDGPU.shader.type(i32)
+; Thest that v_interp_p1 uses different source and destination registers
+; on 16 bank LDS chips.
+
+; 16BANK-LABEL: {{^}}v_interp_p1_bank16_bug:
+; 16BANK-NOT: v_interp_p1_f32 [[DST:v[0-9]+]], [[DST]]
+
+define void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 {
+main_body:
+  %22 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %7)
+  %23 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7)
+  %24 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %5, <2 x i32> %7)
+  %25 = call float @fabs(float %22)
+  %26 = call float @fabs(float %23)
+  %27 = call float @fabs(float %24)
+  %28 = call i32 @llvm.SI.packf16(float %25, float %26)
+  %29 = bitcast i32 %28 to float
+  %30 = call i32 @llvm.SI.packf16(float %27, float 1.000000e+00)
+  %31 = bitcast i32 %30 to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %29, float %31, float %29, float %31)
+  ret void
+}
+
+; Function Attrs: readnone
+declare float @fabs(float) #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.SI.packf16(float, float) #1
 
 ; Function Attrs: nounwind readnone
 declare float @llvm.SI.fs.constant(i32, i32, i32) #1
@@ -28,3 +56,4 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float
 
 attributes #0 = { "ShaderType"="0" }
 attributes #1 = { nounwind readnone }
+attributes #2 = { readnone }
diff --git a/test/CodeGen/R600/llvm.SI.imageload.ll b/test/CodeGen/R600/llvm.SI.imageload.ll
index 35e4591bb1fa..b67716c3b665 100644
--- a/test/CodeGen/R600/llvm.SI.imageload.ll
+++ b/test/CodeGen/R600/llvm.SI.imageload.ll
@@ -88,16 +88,16 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
 ; CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}
 define void @vgpr_coords(float addrspace(2)* addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
-  %20 = getelementptr float addrspace(2)* addrspace(2)* %0, i32 0
-  %21 = load float addrspace(2)* addrspace(2)* %20, !tbaa !2
-  %22 = getelementptr float addrspace(2)* %21, i32 0
-  %23 = load float addrspace(2)* %22, !tbaa !2, !invariant.load !1
-  %24 = getelementptr float addrspace(2)* %21, i32 1
-  %25 = load float addrspace(2)* %24, !tbaa !2, !invariant.load !1
-  %26 = getelementptr float addrspace(2)* %21, i32 4
-  %27 = load float addrspace(2)* %26, !tbaa !2, !invariant.load !1
-  %28 = getelementptr <32 x i8> addrspace(2)* %2, i32 0
-  %29 = load <32 x i8> addrspace(2)* %28, !tbaa !2
+  %20 = getelementptr float addrspace(2)*, float addrspace(2)* addrspace(2)* %0, i32 0
+  %21 = load float addrspace(2)*, float addrspace(2)* addrspace(2)* %20, !tbaa !2
+  %22 = getelementptr float, float addrspace(2)* %21, i32 0
+  %23 = load float, float addrspace(2)* %22, !tbaa !2, !invariant.load !1
+  %24 = getelementptr float, float addrspace(2)* %21, i32 1
+  %25 = load float, float addrspace(2)* %24, !tbaa !2, !invariant.load !1
+  %26 = getelementptr float, float addrspace(2)* %21, i32 4
+  %27 = load float, float addrspace(2)* %26, !tbaa !2, !invariant.load !1
+  %28 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0
+  %29 = load <32 x i8>, <32 x i8> addrspace(2)* %28, !tbaa !2
   %30 = bitcast float %27 to i32
   %31 = bitcast float %23 to i32
   %32 = bitcast float %25 to i32
diff --git a/test/CodeGen/R600/llvm.SI.load.dword.ll b/test/CodeGen/R600/llvm.SI.load.dword.ll
index 8c8f2eed7d9d..f6c258539d5b 100644
--- a/test/CodeGen/R600/llvm.SI.load.dword.ll
+++ b/test/CodeGen/R600/llvm.SI.load.dword.ll
@@ -1,29 +1,41 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=verde -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
 
 ; Example of a simple geometry shader loading vertex attributes from the
 ; ESGS ring buffer
 
-; CHECK-LABEL: {{^}}main:
-; CHECK: buffer_load_dword
-; CHECK: buffer_load_dword
-; CHECK: buffer_load_dword
-; CHECK: buffer_load_dword
+; FIXME: Out of bounds immediate offset crashes
 
-define void @main([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, [2 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* inreg, [17 x <16 x i8>] addrspace(2)* inreg, i32, i32, i32, i32) #0 {
+; CHECK-LABEL: {{^}}main:
+; CHECK: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 glc slc
+; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc slc
+; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen glc slc
+; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen glc slc
+; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding
+; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc
+
+define void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <16 x i8>] addrspace(2)* byval %arg3, [17 x <16 x i8>] addrspace(2)* inreg %arg4, [17 x <16 x i8>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) #0 {
 main_body:
-  %10 = getelementptr [2 x <16 x i8>] addrspace(2)* %3, i64 0, i32 1
-  %11 = load <16 x i8> addrspace(2)* %10, !tbaa !0
-  %12 = shl i32 %6, 2
-  %13 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0)
-  %14 = bitcast i32 %13 to float
-  %15 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %11, i32 %12, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0)
-  %16 = bitcast i32 %15 to float
-  %17 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %11, i32 %12, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 0)
-  %18 = bitcast i32 %17 to float
-  %19 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %11, <2 x i32> <i32 0, i32 0>, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 0)
-  %20 = bitcast i32 %19 to float
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %14, float %16, float %18, float %20)
+  %tmp = getelementptr [2 x <16 x i8>], [2 x <16 x i8>] addrspace(2)* %arg3, i64 0, i32 1
+  %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
+  %tmp11 = shl i32 %arg6, 2
+  %tmp12 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0)
+  %tmp13 = bitcast i32 %tmp12 to float
+  %tmp14 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %tmp15 = bitcast i32 %tmp14 to float
+  %tmp16 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 0)
+  %tmp17 = bitcast i32 %tmp16 to float
+  %tmp18 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 0)
+  %tmp19 = bitcast i32 %tmp18 to float
+
+  %tmp20 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 123, i32 1, i32 1, i32 1, i32 1, i32 0)
+  %tmp21 = bitcast i32 %tmp20 to float
+
+  %tmp22 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 1234, i32 65535, i32 1, i32 1, i32 1, i32 1, i32 0)
+  %tmp23 = bitcast i32 %tmp22 to float
+
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp13, float %tmp15, float %tmp17, float %tmp19)
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp21, float %tmp23, float %tmp23, float %tmp23)
   ret void
 }
 
diff --git a/test/CodeGen/R600/llvm.SI.sendmsg.ll b/test/CodeGen/R600/llvm.SI.sendmsg.ll
index ce3800241953..09675d503355 100644
--- a/test/CodeGen/R600/llvm.SI.sendmsg.ll
+++ b/test/CodeGen/R600/llvm.SI.sendmsg.ll
@@ -2,6 +2,8 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ; CHECK-LABEL: {{^}}main:
+; CHECK: s_mov_b32 m0, 0
+; CHECK-NOT: s_mov_b32 m0
 ; CHECK: s_sendmsg Gs(emit stream 0)
 ; CHECK: s_sendmsg Gs(cut stream 1)
 ; CHECK: s_sendmsg Gs(emit-cut stream 2)
diff --git a/test/CodeGen/R600/llvm.SI.tid.ll b/test/CodeGen/R600/llvm.SI.tid.ll
index 64efd2daf338..f6e6d7050ba7 100644
--- a/test/CodeGen/R600/llvm.SI.tid.ll
+++ b/test/CodeGen/R600/llvm.SI.tid.ll
@@ -1,7 +1,9 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s
 
-;CHECK: v_mbcnt_lo_u32_b32_e64
-;CHECK: v_mbcnt_hi_u32_b32_e32
+;GCN: v_mbcnt_lo_u32_b32_e64
+;SI: v_mbcnt_hi_u32_b32_e32
+;VI: v_mbcnt_hi_u32_b32_e64
 
 define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
 main_body:
diff --git a/test/CodeGen/R600/llvm.amdgpu.dp4.ll b/test/CodeGen/R600/llvm.amdgpu.dp4.ll
index 812b6a40ee59..036cd2ca82a6 100644
--- a/test/CodeGen/R600/llvm.amdgpu.dp4.ll
+++ b/test/CodeGen/R600/llvm.amdgpu.dp4.ll
@@ -3,8 +3,8 @@
 declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) nounwind readnone
 
 define void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind {
-  %src0 = load <4 x float> addrspace(1)* %a, align 16
-  %src1 = load <4 x float> addrspace(1)* %b, align 16
+  %src0 = load <4 x float>, <4 x float> addrspace(1)* %a, align 16
+  %src1 = load <4 x float>, <4 x float> addrspace(1)* %b, align 16
   %dp4 = call float @llvm.AMDGPU.dp4(<4 x float> %src0, <4 x float> %src1) nounwind readnone
   store float %dp4, float addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/R600/llvm.floor.ll b/test/CodeGen/R600/llvm.floor.ll
deleted file mode 100644
index 1016ff75ce9b..000000000000
--- a/test/CodeGen/R600/llvm.floor.ll
+++ /dev/null
@@ -1,54 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
-
-; R600-CHECK: {{^}}f32:
-; R600-CHECK: FLOOR
-; SI-CHECK: {{^}}f32:
-; SI-CHECK: v_floor_f32_e32
-define void @f32(float addrspace(1)* %out, float %in) {
-entry:
-  %0 = call float @llvm.floor.f32(float %in)
-  store float %0, float addrspace(1)* %out
-  ret void
-}
-
-; R600-CHECK: {{^}}v2f32:
-; R600-CHECK: FLOOR
-; R600-CHECK: FLOOR
-; SI-CHECK: {{^}}v2f32:
-; SI-CHECK: v_floor_f32_e32
-; SI-CHECK: v_floor_f32_e32
-define void @v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
-entry:
-  %0 = call <2 x float> @llvm.floor.v2f32(<2 x float> %in)
-  store <2 x float> %0, <2 x float> addrspace(1)* %out
-  ret void
-}
-
-; R600-CHECK: {{^}}v4f32:
-; R600-CHECK: FLOOR
-; R600-CHECK: FLOOR
-; R600-CHECK: FLOOR
-; R600-CHECK: FLOOR
-; SI-CHECK: {{^}}v4f32:
-; SI-CHECK: v_floor_f32_e32
-; SI-CHECK: v_floor_f32_e32
-; SI-CHECK: v_floor_f32_e32
-; SI-CHECK: v_floor_f32_e32
-define void @v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
-entry:
-  %0 = call <4 x float> @llvm.floor.v4f32(<4 x float> %in)
-  store <4 x float> %0, <4 x float> addrspace(1)* %out
-  ret void
-}
-
-; Function Attrs: nounwind readonly
-declare float @llvm.floor.f32(float) #0
-
-; Function Attrs: nounwind readonly
-declare <2 x float> @llvm.floor.v2f32(<2 x float>) #0
-
-; Function Attrs: nounwind readonly
-declare <4 x float> @llvm.floor.v4f32(<4 x float>) #0
-
-attributes #0 = { nounwind readonly }
diff --git a/test/CodeGen/R600/llvm.memcpy.ll b/test/CodeGen/R600/llvm.memcpy.ll
index d6f5f6275acf..e491732cf9c5 100644
--- a/test/CodeGen/R600/llvm.memcpy.ll
+++ b/test/CodeGen/R600/llvm.memcpy.ll
@@ -7,39 +7,23 @@ declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace
 
 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1:
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
-
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
+
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
-
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
-; SI: ds_write_b8
 ; SI: ds_read_u8
 ; SI: ds_read_u8
 
-
 ; SI: ds_read_u8
 ; SI: ds_read_u8
 ; SI: ds_read_u8
@@ -66,6 +50,14 @@ declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace
 ; SI: ds_write_b8
 ; SI: ds_write_b8
 ; SI: ds_write_b8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
 ; SI: ds_write_b8
 
 ; SI: ds_write_b8
@@ -76,6 +68,14 @@ declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace
 ; SI: ds_write_b8
 ; SI: ds_write_b8
 ; SI: ds_write_b8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
 ; SI: ds_write_b8
 
 ; SI: s_endpgm
diff --git a/test/CodeGen/R600/llvm.rint.f64.ll b/test/CodeGen/R600/llvm.rint.f64.ll
index 2c926341f78a..c63fb1727940 100644
--- a/test/CodeGen/R600/llvm.rint.f64.ll
+++ b/test/CodeGen/R600/llvm.rint.f64.ll
@@ -1,3 +1,4 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
diff --git a/test/CodeGen/R600/llvm.round.f64.ll b/test/CodeGen/R600/llvm.round.f64.ll
new file mode 100644
index 000000000000..3d0f57e33280
--- /dev/null
+++ b/test/CodeGen/R600/llvm.round.f64.ll
@@ -0,0 +1,74 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}round_f64:
+; SI: s_endpgm
+define void @round_f64(double addrspace(1)* %out, double %x) #0 {
+  %result = call double @llvm.round.f64(double %x) #1
+  store double %result, double addrspace(1)* %out
+  ret void
+}
+
+; This is a pretty large function, so just test a few of the
+; instructions that are necessary.
+
+; FUNC-LABEL: {{^}}v_round_f64:
+; SI: buffer_load_dwordx2
+; SI: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11
+
+; SI-DAG: v_not_b32_e32
+; SI-DAG: v_not_b32_e32
+
+; SI-DAG: v_cmp_eq_i32
+
+; SI-DAG: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff
+; SI-DAG: v_cmp_gt_i32_e64
+; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]]
+
+; SI-DAG: v_cmp_gt_i32_e64
+
+
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %x = load double, double addrspace(1)* %gep
+  %result = call double @llvm.round.f64(double %x) #1
+  store double %result, double addrspace(1)* %out.gep
+  ret void
+}
+
+; FUNC-LABEL: {{^}}round_v2f64:
+; SI: s_endpgm
+define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 {
+  %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1
+  store <2 x double> %result, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}round_v4f64:
+; SI: s_endpgm
+define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
+  %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1
+  store <4 x double> %result, <4 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}round_v8f64:
+; SI: s_endpgm
+define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 {
+  %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1
+  store <8 x double> %result, <8 x double> addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+declare double @llvm.round.f64(double) #1
+declare <2 x double> @llvm.round.v2f64(<2 x double>) #1
+declare <4 x double> @llvm.round.v4f64(<4 x double>) #1
+declare <8 x double> @llvm.round.v8f64(<8 x double>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/llvm.round.ll b/test/CodeGen/R600/llvm.round.ll
index bedf4ba72ae4..f5f124d915a5 100644
--- a/test/CodeGen/R600/llvm.round.ll
+++ b/test/CodeGen/R600/llvm.round.ll
@@ -1,17 +1,28 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=R600 --check-prefix=FUNC
-
-; FUNC-LABEL: {{^}}f32:
-; R600: FRACT {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]]
-; R600-DAG: ADD  {{.*}}, -0.5
-; R600-DAG: CEIL {{.*}} [[ARG]]
-; R600-DAG: FLOOR {{.*}} [[ARG]]
-; R600-DAG: CNDGE
-; R600-DAG: CNDGT
-; R600: CNDGE {{[^,]+}}, [[ARG]]
-define void @f32(float addrspace(1)* %out, float %in) {
-entry:
-  %0 = call float @llvm.round.f32(float %in)
-  store float %0, float addrspace(1)* %out
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}round_f32:
+; SI-DAG: s_load_dword [[SX:s[0-9]+]]
+; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x7fffffff
+; SI: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]]
+; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
+; SI: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]]
+; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]]
+; SI: v_cmp_le_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0.5, |[[SUB]]|
+; SI: v_cndmask_b32_e64 [[SEL:v[0-9]+]], 0, [[VX]], [[CMP]]
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
+; SI: buffer_store_dword [[RESULT]]
+
+; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]]
+; R600-DAG: ADD  {{.*}},
+; R600-DAG: BFI_INT
+; R600-DAG: SETGE
+; R600-DAG: CNDE
+; R600-DAG: ADD
+define void @round_f32(float addrspace(1)* %out, float %x) #0 {
+  %result = call float @llvm.round.f32(float %x) #1
+  store float %result, float addrspace(1)* %out
   ret void
 }
 
@@ -20,24 +31,37 @@ entry:
 ; a test for the scalar case, so the vector tests just check that the
 ; compiler doesn't crash.
 
-; FUNC-LABEL: v2f32
+; FUNC-LABEL: {{^}}round_v2f32:
+; SI: s_endpgm
 ; R600: CF_END
-define void @v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
-entry:
-  %0 = call <2 x float> @llvm.round.v2f32(<2 x float> %in)
-  store <2 x float> %0, <2 x float> addrspace(1)* %out
+define void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 {
+  %result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1
+  store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: v4f32
+; FUNC-LABEL: {{^}}round_v4f32:
+; SI: s_endpgm
 ; R600: CF_END
-define void @v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
-entry:
-  %0 = call <4 x float> @llvm.round.v4f32(<4 x float> %in)
-  store <4 x float> %0, <4 x float> addrspace(1)* %out
+define void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 {
+  %result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1
+  store <4 x float> %result, <4 x float> addrspace(1)* %out
   ret void
 }
 
-declare float @llvm.round.f32(float)
-declare <2 x float> @llvm.round.v2f32(<2 x float>)
-declare <4 x float> @llvm.round.v4f32(<4 x float>)
+; FUNC-LABEL: {{^}}round_v8f32:
+; SI: s_endpgm
+; R600: CF_END
+define void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 {
+  %result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1
+  store <8 x float> %result, <8 x float> addrspace(1)* %out
+  ret void
+}
+
+declare float @llvm.round.f32(float) #1
+declare <2 x float> @llvm.round.v2f32(<2 x float>) #1
+declare <4 x float> @llvm.round.v4f32(<4 x float>) #1
+declare <8 x float> @llvm.round.v8f32(<8 x float>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/llvm.sqrt.ll b/test/CodeGen/R600/llvm.sqrt.ll
index 1f8df891654b..c6da047f5392 100644
--- a/test/CodeGen/R600/llvm.sqrt.ll
+++ b/test/CodeGen/R600/llvm.sqrt.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=r600 --mcpu=redwood | FileCheck %s --check-prefix=R600
-; RUN: llc < %s -march=r600 --mcpu=SI -verify-machineinstrs| FileCheck %s --check-prefix=SI
-; RUN: llc < %s -march=r600 --mcpu=tonga -verify-machineinstrs| FileCheck %s --check-prefix=SI
+; RUN: llc < %s -march=amdgcn --mcpu=SI -verify-machineinstrs| FileCheck %s --check-prefix=SI
+; RUN: llc < %s -march=amdgcn --mcpu=tonga -verify-machineinstrs| FileCheck %s --check-prefix=SI
 
 ; R600-LABEL: {{^}}sqrt_f32:
 ; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z
@@ -50,6 +50,56 @@ entry:
   ret void
 }
 
+; SI-LABEL: {{^}}elim_redun_check:
+; SI: v_sqrt_f32_e32
+; SI-NOT: v_cndmask
+define void @elim_redun_check(float addrspace(1)* %out, float %in) {
+entry:
+  %sqrt = call float @llvm.sqrt.f32(float %in)
+  %cmp = fcmp olt float %in, -0.000000e+00
+  %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+  store float %res, float addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}elim_redun_check_ult:
+; SI: v_sqrt_f32_e32
+; SI-NOT: v_cndmask
+define void @elim_redun_check_ult(float addrspace(1)* %out, float %in) {
+entry:
+  %sqrt = call float @llvm.sqrt.f32(float %in)
+  %cmp = fcmp ult float %in, -0.000000e+00
+  %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+  store float %res, float addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}elim_redun_check_v2:
+; SI: v_sqrt_f32_e32
+; SI: v_sqrt_f32_e32
+; SI-NOT: v_cndmask
+define void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+  %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+  %cmp = fcmp olt <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
+  %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
+  store <2 x float> %res, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}elim_redun_check_v2_ult
+; SI: v_sqrt_f32_e32
+; SI: v_sqrt_f32_e32
+; SI-NOT: v_cndmask
+define void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+  %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+  %cmp = fcmp ult <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
+  %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
+  store <2 x float> %res, <2 x float> addrspace(1)* %out
+  ret void
+}
+
 declare float @llvm.sqrt.f32(float %in)
 declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
 declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
diff --git a/test/CodeGen/R600/llvm.trunc.ll b/test/CodeGen/R600/llvm.trunc.ll
deleted file mode 100644
index 5585477ef294..000000000000
--- a/test/CodeGen/R600/llvm.trunc.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-; CHECK-LABEL: {{^}}trunc_f32:
-; CHECK: TRUNC
-
-define void @trunc_f32(float addrspace(1)* %out, float %in) {
-entry:
-  %0 = call float @llvm.trunc.f32(float %in)
-  store float %0, float  addrspace(1)* %out
-  ret void
-}
-
-declare float @llvm.trunc.f32(float)
diff --git a/test/CodeGen/R600/load-i1.ll b/test/CodeGen/R600/load-i1.ll
index 315c0a37ebf3..0ca49fde3e7b 100644
--- a/test/CodeGen/R600/load-i1.ll
+++ b/test/CodeGen/R600/load-i1.ll
@@ -11,7 +11,7 @@
 ; EG: VTX_READ_8
 ; EG: AND_INT
 define void @global_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1 addrspace(1)* %in
+  %load = load i1, i1 addrspace(1)* %in
   store i1 %load, i1 addrspace(1)* %out, align 1
   ret void
 }
@@ -26,7 +26,7 @@ define void @global_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) n
 ; EG: AND_INT
 ; EG: LDS_BYTE_WRITE
 define void @local_copy_i1_to_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) nounwind {
-  %load = load i1 addrspace(3)* %in
+  %load = load i1, i1 addrspace(3)* %in
   store i1 %load, i1 addrspace(3)* %out, align 1
   ret void
 }
@@ -40,7 +40,7 @@ define void @local_copy_i1_to_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) no
 ; EG: VTX_READ_8
 ; EG: AND_INT
 define void @constant_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(2)* %in) nounwind {
-  %load = load i1 addrspace(2)* %in
+  %load = load i1, i1 addrspace(2)* %in
   store i1 %load, i1 addrspace(1)* %out, align 1
   ret void
 }
@@ -54,7 +54,7 @@ define void @constant_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(2)* %in)
 ; EG: VTX_READ_8
 ; EG: BFE_INT
 define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1 addrspace(1)* %in
+  %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
@@ -66,7 +66,7 @@ define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
 ; SI: s_endpgm
 
 define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1 addrspace(1)* %in
+  %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
@@ -78,7 +78,7 @@ define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1 addrspace(1)* %in
+  %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i64
   store i64 %ext, i64 addrspace(1)* %out, align 4
   ret void
@@ -90,7 +90,7 @@ define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1 addrspace(1)* %in
+  %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i64
   store i64 %ext, i64 addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/R600/load-input-fold.ll b/test/CodeGen/R600/load-input-fold.ll
index 265fa9bfeb42..1daf0e6527b9 100644
--- a/test/CodeGen/R600/load-input-fold.ll
+++ b/test/CodeGen/R600/load-input-fold.ll
@@ -14,71 +14,71 @@ main_body:
   %9 = extractelement <4 x float> %reg3, i32 1
   %10 = extractelement <4 x float> %reg3, i32 2
   %11 = extractelement <4 x float> %reg3, i32 3
-  %12 = load <4 x float> addrspace(8)* null
+  %12 = load <4 x float>, <4 x float> addrspace(8)* null
   %13 = extractelement <4 x float> %12, i32 0
   %14 = fmul float %0, %13
-  %15 = load <4 x float> addrspace(8)* null
+  %15 = load <4 x float>, <4 x float> addrspace(8)* null
   %16 = extractelement <4 x float> %15, i32 1
   %17 = fmul float %0, %16
-  %18 = load <4 x float> addrspace(8)* null
+  %18 = load <4 x float>, <4 x float> addrspace(8)* null
   %19 = extractelement <4 x float> %18, i32 2
   %20 = fmul float %0, %19
-  %21 = load <4 x float> addrspace(8)* null
+  %21 = load <4 x float>, <4 x float> addrspace(8)* null
   %22 = extractelement <4 x float> %21, i32 3
   %23 = fmul float %0, %22
-  %24 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %25 = extractelement <4 x float> %24, i32 0
   %26 = fmul float %1, %25
   %27 = fadd float %26, %14
-  %28 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %29 = extractelement <4 x float> %28, i32 1
   %30 = fmul float %1, %29
   %31 = fadd float %30, %17
-  %32 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %32 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %33 = extractelement <4 x float> %32, i32 2
   %34 = fmul float %1, %33
   %35 = fadd float %34, %20
-  %36 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %37 = extractelement <4 x float> %36, i32 3
   %38 = fmul float %1, %37
   %39 = fadd float %38, %23
-  %40 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %41 = extractelement <4 x float> %40, i32 0
   %42 = fmul float %2, %41
   %43 = fadd float %42, %27
-  %44 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %45 = extractelement <4 x float> %44, i32 1
   %46 = fmul float %2, %45
   %47 = fadd float %46, %31
-  %48 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %49 = extractelement <4 x float> %48, i32 2
   %50 = fmul float %2, %49
   %51 = fadd float %50, %35
-  %52 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %53 = extractelement <4 x float> %52, i32 3
   %54 = fmul float %2, %53
   %55 = fadd float %54, %39
-  %56 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
   %57 = extractelement <4 x float> %56, i32 0
   %58 = fmul float %3, %57
   %59 = fadd float %58, %43
-  %60 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
   %61 = extractelement <4 x float> %60, i32 1
   %62 = fmul float %3, %61
   %63 = fadd float %62, %47
-  %64 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
   %65 = extractelement <4 x float> %64, i32 2
   %66 = fmul float %3, %65
   %67 = fadd float %66, %51
-  %68 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
   %69 = extractelement <4 x float> %68, i32 3
   %70 = fmul float %3, %69
   %71 = fadd float %70, %55
-  %72 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
   %73 = extractelement <4 x float> %72, i32 0
-  %74 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %74 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
   %75 = extractelement <4 x float> %74, i32 1
-  %76 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
   %77 = extractelement <4 x float> %76, i32 2
   %78 = insertelement <4 x float> undef, float %4, i32 0
   %79 = insertelement <4 x float> %78, float %5, i32 1
diff --git a/test/CodeGen/R600/load.ll b/test/CodeGen/R600/load.ll
index b71b7cb24c49..93b1b51a0d07 100644
--- a/test/CodeGen/R600/load.ll
+++ b/test/CodeGen/R600/load.ll
@@ -13,7 +13,7 @@
 
 ; SI: buffer_load_ubyte v{{[0-9]+}},
 define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
-  %1 = load i8 addrspace(1)* %in
+  %1 = load i8, i8 addrspace(1)* %in
   %2 = zext i8 %1 to i32
   store i32 %2, i32 addrspace(1)* %out
   ret void
@@ -21,14 +21,12 @@ define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 
 ; FUNC-LABEL: {{^}}load_i8_sext:
 ; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
-; R600: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
-; R600: 24
-; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
-; R600: 24
+; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
+; R600: 8
 ; SI: buffer_load_sbyte
 define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
-  %0 = load i8 addrspace(1)* %in
+  %0 = load i8, i8 addrspace(1)* %in
   %1 = sext i8 %0 to i32
   store i32 %1, i32 addrspace(1)* %out
   ret void
@@ -41,7 +39,7 @@ entry:
 ; SI: buffer_load_ubyte
 define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
 entry:
-  %0 = load <2 x i8> addrspace(1)* %in
+  %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in
   %1 = zext <2 x i8> %0 to <2 x i32>
   store <2 x i32> %1, <2 x i32> addrspace(1)* %out
   ret void
@@ -50,19 +48,16 @@ entry:
 ; FUNC-LABEL: {{^}}load_v2i8_sext:
 ; R600-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
 ; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
-; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
-; R600-DAG: 24
-; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_X_CHAN]]
-; R600-DAG: 24
-; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Y_CHAN:[XYZW]]], [[DST_Y]]
-; R600-DAG: 24
-; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
-; R600-DAG: 24
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; R600-DAG: 8
+; R600-DAG: 8
+
 ; SI: buffer_load_sbyte
 ; SI: buffer_load_sbyte
 define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
 entry:
-  %0 = load <2 x i8> addrspace(1)* %in
+  %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in
   %1 = sext <2 x i8> %0 to <2 x i32>
   store <2 x i32> %1, <2 x i32> addrspace(1)* %out
   ret void
@@ -79,7 +74,7 @@ entry:
 ; SI: buffer_load_ubyte
 define void @load_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
 entry:
-  %0 = load <4 x i8> addrspace(1)* %in
+  %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in
   %1 = zext <4 x i8> %0 to <4 x i32>
   store <4 x i32> %1, <4 x i32> addrspace(1)* %out
   ret void
@@ -90,29 +85,21 @@ entry:
 ; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
 ; R600-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
 ; R600-DAG: VTX_READ_8 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
-; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
-; R600-DAG: 24
-; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_X_CHAN]]
-; R600-DAG: 24
-; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Y_CHAN:[XYZW]]], [[DST_Y]]
-; R600-DAG: 24
-; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
-; R600-DAG: 24
-; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Z_CHAN:[XYZW]]], [[DST_Z]]
-; R600-DAG: 24
-; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Z_CHAN]]
-; R600-DAG: 24
-; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_W_CHAN:[XYZW]]], [[DST_W]]
-; R600-DAG: 24
-; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_W_CHAN]]
-; R600-DAG: 24
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal
+; R600-DAG: 8
+; R600-DAG: 8
+; R600-DAG: 8
+; R600-DAG: 8
 ; SI: buffer_load_sbyte
 ; SI: buffer_load_sbyte
 ; SI: buffer_load_sbyte
 ; SI: buffer_load_sbyte
 define void @load_v4i8_sext(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
 entry:
-  %0 = load <4 x i8> addrspace(1)* %in
+  %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in
   %1 = sext <4 x i8> %0 to <4 x i32>
   store <4 x i32> %1, <4 x i32> addrspace(1)* %out
   ret void
@@ -124,7 +111,7 @@ entry:
 ; SI: buffer_load_ushort
 define void @load_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
 entry:
-  %0 = load i16	 addrspace(1)* %in
+  %0 = load i16	, i16	 addrspace(1)* %in
   %1 = zext i16 %0 to i32
   store i32 %1, i32 addrspace(1)* %out
   ret void
@@ -132,14 +119,12 @@ entry:
 
 ; FUNC-LABEL: {{^}}load_i16_sext:
 ; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
-; R600: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
-; R600: 16
-; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
+; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; R600: 16
 ; SI: buffer_load_sshort
 define void @load_i16_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
 entry:
-  %0 = load i16 addrspace(1)* %in
+  %0 = load i16, i16 addrspace(1)* %in
   %1 = sext i16 %0 to i32
   store i32 %1, i32 addrspace(1)* %out
   ret void
@@ -152,7 +137,7 @@ entry:
 ; SI: buffer_load_ushort
 define void @load_v2i16(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
 entry:
-  %0 = load <2 x i16> addrspace(1)* %in
+  %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %1 = zext <2 x i16> %0 to <2 x i32>
   store <2 x i32> %1, <2 x i32> addrspace(1)* %out
   ret void
@@ -161,19 +146,15 @@ entry:
 ; FUNC-LABEL: {{^}}load_v2i16_sext:
 ; R600-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
 ; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
-; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
-; R600-DAG: 16
-; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_X_CHAN]]
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
 ; R600-DAG: 16
-; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Y_CHAN:[XYZW]]], [[DST_Y]]
-; R600-DAG: 16
-; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
 ; R600-DAG: 16
 ; SI: buffer_load_sshort
 ; SI: buffer_load_sshort
 define void @load_v2i16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
 entry:
-  %0 = load <2 x i16> addrspace(1)* %in
+  %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %1 = sext <2 x i16> %0 to <2 x i32>
   store <2 x i32> %1, <2 x i32> addrspace(1)* %out
   ret void
@@ -190,7 +171,7 @@ entry:
 ; SI: buffer_load_ushort
 define void @load_v4i16(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
 entry:
-  %0 = load <4 x i16> addrspace(1)* %in
+  %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %1 = zext <4 x i16> %0 to <4 x i32>
   store <4 x i32> %1, <4 x i32> addrspace(1)* %out
   ret void
@@ -201,21 +182,13 @@ entry:
 ; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
 ; R600-DAG: VTX_READ_16 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
 ; R600-DAG: VTX_READ_16 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
-; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
-; R600-DAG: 16
-; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_X_CHAN]]
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal
 ; R600-DAG: 16
-; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Y_CHAN:[XYZW]]], [[DST_Y]]
 ; R600-DAG: 16
-; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
 ; R600-DAG: 16
-; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Z_CHAN:[XYZW]]], [[DST_Z]]
-; R600-DAG: 16
-; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Z_CHAN]]
-; R600-DAG: 16
-; R600-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_W_CHAN:[XYZW]]], [[DST_W]]
-; R600-DAG: 16
-; R600-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_W_CHAN]]
 ; R600-DAG: 16
 ; SI: buffer_load_sshort
 ; SI: buffer_load_sshort
@@ -223,7 +196,7 @@ entry:
 ; SI: buffer_load_sshort
 define void @load_v4i16_sext(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
 entry:
-  %0 = load <4 x i16> addrspace(1)* %in
+  %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %1 = sext <4 x i16> %0 to <4 x i32>
   store <4 x i32> %1, <4 x i32> addrspace(1)* %out
   ret void
@@ -236,7 +209,7 @@ entry:
 ; SI: buffer_load_dword v{{[0-9]+}}
 define void @load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
-  %0 = load i32 addrspace(1)* %in
+  %0 = load i32, i32 addrspace(1)* %in
   store i32 %0, i32 addrspace(1)* %out
   ret void
 }
@@ -248,7 +221,7 @@ entry:
 ; SI: buffer_load_dword v{{[0-9]+}}
 define void @load_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
-  %0 = load float addrspace(1)* %in
+  %0 = load float, float addrspace(1)* %in
   store float %0, float addrspace(1)* %out
   ret void
 }
@@ -260,7 +233,7 @@ entry:
 ; SI: buffer_load_dwordx2
 define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
 entry:
-  %0 = load <2 x float> addrspace(1)* %in
+  %0 = load <2 x float>, <2 x float> addrspace(1)* %in
   store <2 x float> %0, <2 x float> addrspace(1)* %out
   ret void
 }
@@ -270,7 +243,7 @@ entry:
 ; SI: buffer_load_dwordx2
 define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 entry:
-  %0 = load i64 addrspace(1)* %in
+  %0 = load i64, i64 addrspace(1)* %in
   store i64 %0, i64 addrspace(1)* %out
   ret void
 }
@@ -284,7 +257,7 @@ entry:
 
 define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
-  %0 = load i32 addrspace(1)* %in
+  %0 = load i32, i32 addrspace(1)* %in
   %1 = sext i32 %0 to i64
   store i64 %1, i64 addrspace(1)* %out
   ret void
@@ -295,7 +268,7 @@ entry:
 ; R600: MEM_RAT
 define void @load_i64_zext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
-  %0 = load i32 addrspace(1)* %in
+  %0 = load i32, i32 addrspace(1)* %in
   %1 = zext i32 %0 to i64
   store i64 %1, i64 addrspace(1)* %out
   ret void
@@ -315,7 +288,7 @@ entry:
 ; SI: buffer_load_dword
 define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) {
 entry:
-  %0 = load <8 x i32> addrspace(1)* %in
+  %0 = load <8 x i32>, <8 x i32> addrspace(1)* %in
   store <8 x i32> %0, <8 x i32> addrspace(1)* %out
   ret void
 }
@@ -344,7 +317,7 @@ entry:
 ; SI: buffer_load_dword
 define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) {
 entry:
-  %0 = load <16 x i32> addrspace(1)* %in
+  %0 = load <16 x i32>, <16 x i32> addrspace(1)* %in
   store <16 x i32> %0, <16 x i32> addrspace(1)* %out
   ret void
 }
@@ -356,14 +329,12 @@ entry:
 ; Load a sign-extended i8 value
 ; FUNC-LABEL: {{^}}load_const_i8_sext:
 ; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
-; R600: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
-; R600: 24
-; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
-; R600: 24
+; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
+; R600: 8
 ; SI: buffer_load_sbyte v{{[0-9]+}},
 define void @load_const_i8_sext(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
 entry:
-  %0 = load i8 addrspace(2)* %in
+  %0 = load i8, i8 addrspace(2)* %in
   %1 = sext i8 %0 to i32
   store i32 %1, i32 addrspace(1)* %out
   ret void
@@ -375,7 +346,7 @@ entry:
 ; SI: buffer_load_ubyte v{{[0-9]+}},
 define void @load_const_i8_aligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
 entry:
-  %0 = load i8 addrspace(2)* %in
+  %0 = load i8, i8 addrspace(2)* %in
   %1 = zext i8 %0 to i32
   store i32 %1, i32 addrspace(1)* %out
   ret void
@@ -387,8 +358,8 @@ entry:
 ; SI: buffer_load_ubyte v{{[0-9]+}},
 define void @load_const_i8_unaligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
 entry:
-  %0 = getelementptr i8 addrspace(2)* %in, i32 1
-  %1 = load i8 addrspace(2)* %0
+  %0 = getelementptr i8, i8 addrspace(2)* %in, i32 1
+  %1 = load i8, i8 addrspace(2)* %0
   %2 = zext i8 %1 to i32
   store i32 %2, i32 addrspace(1)* %out
   ret void
@@ -397,14 +368,12 @@ entry:
 ; Load a sign-extended i16 value
 ; FUNC-LABEL: {{^}}load_const_i16_sext:
 ; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
-; R600: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
-; R600: 16
-; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
+; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; R600: 16
 ; SI: buffer_load_sshort
 define void @load_const_i16_sext(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
 entry:
-  %0 = load i16 addrspace(2)* %in
+  %0 = load i16, i16 addrspace(2)* %in
   %1 = sext i16 %0 to i32
   store i32 %1, i32 addrspace(1)* %out
   ret void
@@ -416,7 +385,7 @@ entry:
 ; SI: buffer_load_ushort
 define void @load_const_i16_aligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
 entry:
-  %0 = load i16 addrspace(2)* %in
+  %0 = load i16, i16 addrspace(2)* %in
   %1 = zext i16 %0 to i32
   store i32 %1, i32 addrspace(1)* %out
   ret void
@@ -428,8 +397,8 @@ entry:
 ; SI: buffer_load_ushort
 define void @load_const_i16_unaligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
 entry:
-  %0 = getelementptr i16 addrspace(2)* %in, i32 1
-  %1 = load i16 addrspace(2)* %0
+  %0 = getelementptr i16, i16 addrspace(2)* %in, i32 1
+  %1 = load i16, i16 addrspace(2)* %0
   %2 = zext i16 %1 to i32
   store i32 %2, i32 addrspace(1)* %out
   ret void
@@ -442,7 +411,7 @@ entry:
 ; SI: s_load_dword s{{[0-9]+}}
 define void @load_const_addrspace_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
-  %0 = load i32 addrspace(2)* %in
+  %0 = load i32, i32 addrspace(2)* %in
   store i32 %0, i32 addrspace(1)* %out
   ret void
 }
@@ -453,7 +422,7 @@ entry:
 
 ; SI: s_load_dword s{{[0-9]+}}
 define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(2)* %in) {
-  %1 = load float addrspace(2)* %in
+  %1 = load float, float addrspace(2)* %in
   store float %1, float addrspace(1)* %out
   ret void
 }
@@ -469,7 +438,7 @@ define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(
 ; SI: s_mov_b32 m0
 ; SI: ds_read_u8
 define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
-  %1 = load i8 addrspace(3)* %in
+  %1 = load i8, i8 addrspace(3)* %in
   %2 = zext i8 %1 to i32
   store i32 %2, i32 addrspace(1)* %out
   ret void
@@ -477,13 +446,13 @@ define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
 
 ; FUNC-LABEL: {{^}}load_i8_sext_local:
 ; R600: LDS_UBYTE_READ_RET
-; R600: ASHR
+; R600: BFE_INT
 ; SI-NOT: s_wqm_b64
 ; SI: s_mov_b32 m0
 ; SI: ds_read_i8
 define void @load_i8_sext_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
 entry:
-  %0 = load i8 addrspace(3)* %in
+  %0 = load i8, i8 addrspace(3)* %in
   %1 = sext i8 %0 to i32
   store i32 %1, i32 addrspace(1)* %out
   ret void
@@ -498,7 +467,7 @@ entry:
 ; SI: ds_read_u8
 define void @load_v2i8_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
 entry:
-  %0 = load <2 x i8> addrspace(3)* %in
+  %0 = load <2 x i8>, <2 x i8> addrspace(3)* %in
   %1 = zext <2 x i8> %0 to <2 x i32>
   store <2 x i32> %1, <2 x i32> addrspace(1)* %out
   ret void
@@ -507,15 +476,15 @@ entry:
 ; FUNC-LABEL: {{^}}load_v2i8_sext_local:
 ; R600-DAG: LDS_UBYTE_READ_RET
 ; R600-DAG: LDS_UBYTE_READ_RET
-; R600-DAG: ASHR
-; R600-DAG: ASHR
+; R600-DAG: BFE_INT
+; R600-DAG: BFE_INT
 ; SI-NOT: s_wqm_b64
 ; SI: s_mov_b32 m0
 ; SI: ds_read_i8
 ; SI: ds_read_i8
 define void @load_v2i8_sext_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
 entry:
-  %0 = load <2 x i8> addrspace(3)* %in
+  %0 = load <2 x i8>, <2 x i8> addrspace(3)* %in
   %1 = sext <2 x i8> %0 to <2 x i32>
   store <2 x i32> %1, <2 x i32> addrspace(1)* %out
   ret void
@@ -534,7 +503,7 @@ entry:
 ; SI: ds_read_u8
 define void @load_v4i8_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) {
 entry:
-  %0 = load <4 x i8> addrspace(3)* %in
+  %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in
   %1 = zext <4 x i8> %0 to <4 x i32>
   store <4 x i32> %1, <4 x i32> addrspace(1)* %out
   ret void
@@ -545,10 +514,10 @@ entry:
 ; R600-DAG: LDS_UBYTE_READ_RET
 ; R600-DAG: LDS_UBYTE_READ_RET
 ; R600-DAG: LDS_UBYTE_READ_RET
-; R600-DAG: ASHR
-; R600-DAG: ASHR
-; R600-DAG: ASHR
-; R600-DAG: ASHR
+; R600-DAG: BFE_INT
+; R600-DAG: BFE_INT
+; R600-DAG: BFE_INT
+; R600-DAG: BFE_INT
 ; SI-NOT: s_wqm_b64
 ; SI: s_mov_b32 m0
 ; SI: ds_read_i8
@@ -557,7 +526,7 @@ entry:
 ; SI: ds_read_i8
 define void @load_v4i8_sext_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) {
 entry:
-  %0 = load <4 x i8> addrspace(3)* %in
+  %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in
   %1 = sext <4 x i8> %0 to <4 x i32>
   store <4 x i32> %1, <4 x i32> addrspace(1)* %out
   ret void
@@ -571,7 +540,7 @@ entry:
 ; SI: ds_read_u16
 define void @load_i16_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
 entry:
-  %0 = load i16	 addrspace(3)* %in
+  %0 = load i16	, i16	 addrspace(3)* %in
   %1 = zext i16 %0 to i32
   store i32 %1, i32 addrspace(1)* %out
   ret void
@@ -579,13 +548,13 @@ entry:
 
 ; FUNC-LABEL: {{^}}load_i16_sext_local:
 ; R600: LDS_USHORT_READ_RET
-; R600: ASHR
+; R600: BFE_INT
 ; SI-NOT: s_wqm_b64
 ; SI: s_mov_b32 m0
 ; SI: ds_read_i16
 define void @load_i16_sext_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
 entry:
-  %0 = load i16 addrspace(3)* %in
+  %0 = load i16, i16 addrspace(3)* %in
   %1 = sext i16 %0 to i32
   store i32 %1, i32 addrspace(1)* %out
   ret void
@@ -600,7 +569,7 @@ entry:
 ; SI: ds_read_u16
 define void @load_v2i16_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
 entry:
-  %0 = load <2 x i16> addrspace(3)* %in
+  %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in
   %1 = zext <2 x i16> %0 to <2 x i32>
   store <2 x i32> %1, <2 x i32> addrspace(1)* %out
   ret void
@@ -609,15 +578,15 @@ entry:
 ; FUNC-LABEL: {{^}}load_v2i16_sext_local:
 ; R600-DAG: LDS_USHORT_READ_RET
 ; R600-DAG: LDS_USHORT_READ_RET
-; R600-DAG: ASHR
-; R600-DAG: ASHR
+; R600-DAG: BFE_INT
+; R600-DAG: BFE_INT
 ; SI-NOT: s_wqm_b64
 ; SI: s_mov_b32 m0
 ; SI: ds_read_i16
 ; SI: ds_read_i16
 define void @load_v2i16_sext_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
 entry:
-  %0 = load <2 x i16> addrspace(3)* %in
+  %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in
   %1 = sext <2 x i16> %0 to <2 x i32>
   store <2 x i32> %1, <2 x i32> addrspace(1)* %out
   ret void
@@ -636,7 +605,7 @@ entry:
 ; SI: ds_read_u16
 define void @load_v4i16_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) {
 entry:
-  %0 = load <4 x i16> addrspace(3)* %in
+  %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in
   %1 = zext <4 x i16> %0 to <4 x i32>
   store <4 x i32> %1, <4 x i32> addrspace(1)* %out
   ret void
@@ -647,10 +616,10 @@ entry:
 ; R600-DAG: LDS_USHORT_READ_RET
 ; R600-DAG: LDS_USHORT_READ_RET
 ; R600-DAG: LDS_USHORT_READ_RET
-; R600-DAG: ASHR
-; R600-DAG: ASHR
-; R600-DAG: ASHR
-; R600-DAG: ASHR
+; R600-DAG: BFE_INT
+; R600-DAG: BFE_INT
+; R600-DAG: BFE_INT
+; R600-DAG: BFE_INT
 ; SI-NOT: s_wqm_b64
 ; SI: s_mov_b32 m0
 ; SI: ds_read_i16
@@ -659,7 +628,7 @@ entry:
 ; SI: ds_read_i16
 define void @load_v4i16_sext_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) {
 entry:
-  %0 = load <4 x i16> addrspace(3)* %in
+  %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in
   %1 = sext <4 x i16> %0 to <4 x i32>
   store <4 x i32> %1, <4 x i32> addrspace(1)* %out
   ret void
@@ -673,7 +642,7 @@ entry:
 ; SI: ds_read_b32
 define void @load_i32_local(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
-  %0 = load i32 addrspace(3)* %in
+  %0 = load i32, i32 addrspace(3)* %in
   store i32 %0, i32 addrspace(1)* %out
   ret void
 }
@@ -685,7 +654,7 @@ entry:
 ; SI: ds_read_b32
 define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) {
 entry:
-  %0 = load float addrspace(3)* %in
+  %0 = load float, float addrspace(3)* %in
   store float %0, float addrspace(1)* %out
   ret void
 }
@@ -698,7 +667,7 @@ entry:
 ; SI: ds_read_b64
 define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) {
 entry:
-  %0 = load <2 x float> addrspace(3)* %in
+  %0 = load <2 x float>, <2 x float> addrspace(3)* %in
   store <2 x float> %0, <2 x float> addrspace(1)* %out
   ret void
 }
@@ -711,10 +680,10 @@ entry:
 ; SI-DAG: ds_read_b32
 ; SI-DAG: ds_read2_b32
 define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) {
-  %scalar = load i32 addrspace(3)* %in
+  %scalar = load i32, i32 addrspace(3)* %in
   %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)*
-  %vec_ptr = getelementptr <2 x i32> addrspace(3)* %tmp0, i32 2
-  %vec0 = load <2 x i32> addrspace(3)* %vec_ptr, align 4
+  %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2
+  %vec0 = load <2 x i32>, <2 x i32> addrspace(3)* %vec_ptr, align 4
   %vec1 = insertelement <2 x i32> <i32 0, i32 0>, i32 %scalar, i32 0
   %vec = add <2 x i32> %vec0, %vec1
   store <2 x i32> %vec, <2 x i32> addrspace(1)* %out
@@ -732,9 +701,9 @@ define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)
 ; R600: LDS_READ_RET
 define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
-  %tmp0 = getelementptr [512 x i32] addrspace(3)* @lds, i32 0, i32 1
-  %tmp1 = load i32 addrspace(3)* %tmp0
-  %tmp2 = getelementptr i32 addrspace(1)* %out, i32 1
+  %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1
+  %tmp1 = load i32, i32 addrspace(3)* %tmp0
+  %tmp2 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   store i32 %tmp1, i32 addrspace(1)* %tmp2
   ret void
 }
diff --git a/test/CodeGen/R600/load.vec.ll b/test/CodeGen/R600/load.vec.ll
index 346d8dc0c6e4..02f883cd8e9c 100644
--- a/test/CodeGen/R600/load.vec.ll
+++ b/test/CodeGen/R600/load.vec.ll
@@ -8,7 +8,7 @@
 ; SI: {{^}}load_v2i32:
 ; SI: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}]
 define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %a = load <2 x i32> addrspace(1) * %in
+  %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
   store <2 x i32> %a, <2 x i32> addrspace(1)* %out
   ret void
 }
@@ -19,7 +19,7 @@ define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
 ; SI: {{^}}load_v4i32:
 ; SI: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}]
 define void @load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %a = load <4 x i32> addrspace(1) * %in
+  %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
   store <4 x i32> %a, <4 x i32> addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/R600/load64.ll b/test/CodeGen/R600/load64.ll
index cb3d65466061..74beabdc0076 100644
--- a/test/CodeGen/R600/load64.ll
+++ b/test/CodeGen/R600/load64.ll
@@ -6,7 +6,7 @@
 ; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}]
 ; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}]
 define void @load_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
-  %1 = load double addrspace(1)* %in
+  %1 = load double, double addrspace(1)* %in
   store double %1, double addrspace(1)* %out
   ret void
 }
@@ -15,7 +15,7 @@ define void @load_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
 ; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}]
 ; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}]
 define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
-  %tmp = load i64 addrspace(1)* %in
+  %tmp = load i64, i64 addrspace(1)* %in
   store i64 %tmp, i64 addrspace(1)* %out, align 8
   ret void
 }
@@ -25,7 +25,7 @@ define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ; CHECK: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}]
 ; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}]
 define void @load_const_addrspace_f64(double addrspace(1)* %out, double addrspace(2)* %in) {
-  %1 = load double addrspace(2)* %in
+  %1 = load double, double addrspace(2)* %in
   store double %1, double addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/R600/local-64.ll b/test/CodeGen/R600/local-64.ll
index f975bc1f56b0..33f3159d13eb 100644
--- a/test/CodeGen/R600/local-64.ll
+++ b/test/CodeGen/R600/local-64.ll
@@ -1,32 +1,33 @@
 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=BOTH %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s
 
 ; BOTH-LABEL: {{^}}local_i32_load
-; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28 [M0]
+; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28
 ; BOTH: buffer_store_dword [[REG]],
 define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %in, i32 7
-  %val = load i32 addrspace(3)* %gep, align 4
+  %gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+  %val = load i32, i32 addrspace(3)* %gep, align 4
   store i32 %val, i32 addrspace(1)* %out, align 4
   ret void
 }
 
 ; BOTH-LABEL: {{^}}local_i32_load_0_offset
-; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} [M0]
+; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}}
 ; BOTH: buffer_store_dword [[REG]],
 define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
-  %val = load i32 addrspace(3)* %in, align 4
+  %val = load i32, i32 addrspace(3)* %in, align 4
   store i32 %val, i32 addrspace(1)* %out, align 4
   ret void
 }
 
 ; BOTH-LABEL: {{^}}local_i8_load_i16_max_offset:
 ; BOTH-NOT: ADD
-; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535 [M0]
+; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535
 ; BOTH: buffer_store_byte [[REG]],
 define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
-  %gep = getelementptr i8 addrspace(3)* %in, i32 65535
-  %val = load i8 addrspace(3)* %gep, align 4
+  %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65535
+  %val = load i8, i8 addrspace(3)* %gep, align 4
   store i8 %val, i8 addrspace(1)* %out, align 4
   ret void
 }
@@ -37,67 +38,67 @@ define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)
 ; SI: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
 ; CI: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
 ; BOTH: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]]
-; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]] [M0]
+; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]]
 ; BOTH: buffer_store_byte [[REG]],
 define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
-  %gep = getelementptr i8 addrspace(3)* %in, i32 65536
-  %val = load i8 addrspace(3)* %gep, align 4
+  %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65536
+  %val = load i8, i8 addrspace(3)* %gep, align 4
   store i8 %val, i8 addrspace(1)* %out, align 4
   ret void
 }
 
 ; BOTH-LABEL: {{^}}local_i64_load:
 ; BOTH-NOT: ADD
-; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 [M0]
+; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56
 ; BOTH: buffer_store_dwordx2 [[REG]],
 define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %in, i32 7
-  %val = load i64 addrspace(3)* %gep, align 8
+  %gep = getelementptr i64, i64 addrspace(3)* %in, i32 7
+  %val = load i64, i64 addrspace(3)* %gep, align 8
   store i64 %val, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; BOTH-LABEL: {{^}}local_i64_load_0_offset
-; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} [M0]
+; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
 ; BOTH: buffer_store_dwordx2 [[REG]],
 define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
-  %val = load i64 addrspace(3)* %in, align 8
+  %val = load i64, i64 addrspace(3)* %in, align 8
   store i64 %val, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; BOTH-LABEL: {{^}}local_f64_load:
 ; BOTH-NOT: ADD
-; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 [M0]
+; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56
 ; BOTH: buffer_store_dwordx2 [[REG]],
 define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
-  %gep = getelementptr double addrspace(3)* %in, i32 7
-  %val = load double addrspace(3)* %gep, align 8
+  %gep = getelementptr double, double addrspace(3)* %in, i32 7
+  %val = load double, double addrspace(3)* %gep, align 8
   store double %val, double addrspace(1)* %out, align 8
   ret void
 }
 
 ; BOTH-LABEL: {{^}}local_f64_load_0_offset
-; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} [M0]
+; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
 ; BOTH: buffer_store_dwordx2 [[REG]],
 define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
-  %val = load double addrspace(3)* %in, align 8
+  %val = load double, double addrspace(3)* %in, align 8
   store double %val, double addrspace(1)* %out, align 8
   ret void
 }
 
 ; BOTH-LABEL: {{^}}local_i64_store:
 ; BOTH-NOT: ADD
-; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 [M0]
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
 define void @local_i64_store(i64 addrspace(3)* %out) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %out, i32 7
+  %gep = getelementptr i64, i64 addrspace(3)* %out, i32 7
   store i64 5678, i64 addrspace(3)* %gep, align 8
   ret void
 }
 
 ; BOTH-LABEL: {{^}}local_i64_store_0_offset:
 ; BOTH-NOT: ADD
-; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} [M0]
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
 define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
   store i64 1234, i64 addrspace(3)* %out, align 8
   ret void
@@ -105,15 +106,15 @@ define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
 
 ; BOTH-LABEL: {{^}}local_f64_store:
 ; BOTH-NOT: ADD
-; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 [M0]
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
 define void @local_f64_store(double addrspace(3)* %out) nounwind {
-  %gep = getelementptr double addrspace(3)* %out, i32 7
+  %gep = getelementptr double, double addrspace(3)* %out, i32 7
   store double 16.0, double addrspace(3)* %gep, align 8
   ret void
 }
 
 ; BOTH-LABEL: {{^}}local_f64_store_0_offset
-; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} [M0]
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
 define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
   store double 20.0, double addrspace(3)* %out, align 8
   ret void
@@ -121,19 +122,19 @@ define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
 
 ; BOTH-LABEL: {{^}}local_v2i64_store:
 ; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:112 [M0]
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:120 [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:112
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:120
 ; BOTH: s_endpgm
 define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
-  %gep = getelementptr <2 x i64> addrspace(3)* %out, i32 7
+  %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7
   store <2 x i64> <i64 5678, i64 5678>, <2 x i64> addrspace(3)* %gep, align 16
   ret void
 }
 
 ; BOTH-LABEL: {{^}}local_v2i64_store_0_offset:
 ; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} [M0]
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8 [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
 ; BOTH: s_endpgm
 define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
   store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
@@ -142,23 +143,23 @@ define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
 
 ; BOTH-LABEL: {{^}}local_v4i64_store:
 ; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:224 [M0]
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:232 [M0]
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240 [M0]
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:248 [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:224
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:232
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:248
 ; BOTH: s_endpgm
 define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
-  %gep = getelementptr <4 x i64> addrspace(3)* %out, i32 7
+  %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7
   store <4 x i64> <i64 5678, i64 5678, i64 5678, i64 5678>, <4 x i64> addrspace(3)* %gep, align 16
   ret void
 }
 
 ; BOTH-LABEL: {{^}}local_v4i64_store_0_offset:
 ; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} [M0]
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8 [M0]
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16 [M0]
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:24 [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:24
 ; BOTH: s_endpgm
 define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
   store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
diff --git a/test/CodeGen/R600/local-atomics.ll b/test/CodeGen/R600/local-atomics.ll
index 16d3173f3692..2aaf977ab903 100644
--- a/test/CodeGen/R600/local-atomics.ll
+++ b/test/CodeGen/R600/local-atomics.ll
@@ -1,15 +1,16 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:
 ; EG: LDS_WRXCHG_RET *
-; SI: s_load_dword [[SPTR:s[0-9]+]],
-; SI: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
-; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; SI: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] [M0]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -18,10 +19,10 @@ define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %
 
 ; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32_offset:
 ; EG: LDS_WRXCHG_RET *
-; SI: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -30,12 +31,12 @@ define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
 ; XXX - Is it really necessary to load 4 into VGPR?
 ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32:
 ; EG: LDS_ADD_RET *
-; SI: s_load_dword [[SPTR:s[0-9]+]],
-; SI: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
-; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; SI: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] [M0]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
 define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -44,10 +45,10 @@ define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 
 ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_offset:
 ; EG: LDS_ADD_RET *
-; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -55,13 +56,13 @@ define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 
 ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_bad_si_offset:
 ; EG: LDS_ADD_RET *
-; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} [M0]
-; CI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 %add
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -69,9 +70,9 @@ define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 ad
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32:
 ; EG: LDS_ADD_RET *
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] [M0]
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]]
+; GCN: s_endpgm
 define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -80,11 +81,11 @@ define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_offset:
 ; EG: LDS_ADD_RET *
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
+; GCN: s_endpgm
 define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -92,13 +93,13 @@ define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_bad_si_offset:
 ; EG: LDS_ADD_RET *
-; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} [M0]
-; CI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CIVI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_inc_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 %add
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -106,8 +107,8 @@ define void @lds_atomic_inc_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 ad
 
 ; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32:
 ; EG: LDS_SUB_RET *
-; SI: ds_sub_rtn_u32
-; SI: s_endpgm
+; GCN: ds_sub_rtn_u32
+; GCN: s_endpgm
 define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -116,10 +117,10 @@ define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 
 ; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32_offset:
 ; EG: LDS_SUB_RET *
-; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -127,9 +128,9 @@ define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 
 ; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32:
 ; EG: LDS_SUB_RET *
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; SI: ds_dec_rtn_u32  v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] [M0]
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_dec_rtn_u32  v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]]
+; GCN: s_endpgm
 define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -138,11 +139,11 @@ define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 
 ; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32_offset:
 ; EG: LDS_SUB_RET *
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; SI: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
+; GCN: s_endpgm
 define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -150,8 +151,8 @@ define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 
 ; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32:
 ; EG: LDS_AND_RET *
-; SI: ds_and_rtn_b32
-; SI: s_endpgm
+; GCN: ds_and_rtn_b32
+; GCN: s_endpgm
 define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -160,10 +161,10 @@ define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 
 ; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32_offset:
 ; EG: LDS_AND_RET *
-; SI: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -171,8 +172,8 @@ define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 
 ; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32:
 ; EG: LDS_OR_RET *
-; SI: ds_or_rtn_b32
-; SI: s_endpgm
+; GCN: ds_or_rtn_b32
+; GCN: s_endpgm
 define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -181,10 +182,10 @@ define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %pt
 
 ; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32_offset:
 ; EG: LDS_OR_RET *
-; SI: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -192,8 +193,8 @@ define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(
 
 ; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32:
 ; EG: LDS_XOR_RET *
-; SI: ds_xor_rtn_b32
-; SI: s_endpgm
+; GCN: ds_xor_rtn_b32
+; GCN: s_endpgm
 define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -202,10 +203,10 @@ define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 
 ; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32_offset:
 ; EG: LDS_XOR_RET *
-; SI: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -221,8 +222,8 @@ define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 
 ; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32:
 ; EG: LDS_MIN_INT_RET *
-; SI: ds_min_rtn_i32
-; SI: s_endpgm
+; GCN: ds_min_rtn_i32
+; GCN: s_endpgm
 define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -231,10 +232,10 @@ define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 
 ; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32_offset:
 ; EG: LDS_MIN_INT_RET *
-; SI: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -242,8 +243,8 @@ define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 
 ; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32:
 ; EG: LDS_MAX_INT_RET *
-; SI: ds_max_rtn_i32
-; SI: s_endpgm
+; GCN: ds_max_rtn_i32
+; GCN: s_endpgm
 define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -252,10 +253,10 @@ define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 
 ; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32_offset:
 ; EG: LDS_MAX_INT_RET *
-; SI: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -263,8 +264,8 @@ define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 
 ; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32:
 ; EG: LDS_MIN_UINT_RET *
-; SI: ds_min_rtn_u32
-; SI: s_endpgm
+; GCN: ds_min_rtn_u32
+; GCN: s_endpgm
 define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -273,10 +274,10 @@ define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %
 
 ; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32_offset:
 ; EG: LDS_MIN_UINT_RET *
-; SI: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -284,8 +285,8 @@ define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
 
 ; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32:
 ; EG: LDS_MAX_UINT_RET *
-; SI: ds_max_rtn_u32
-; SI: s_endpgm
+; GCN: ds_max_rtn_u32
+; GCN: s_endpgm
 define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -294,182 +295,182 @@ define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %
 
 ; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32_offset:
 ; EG: LDS_MAX_UINT_RET *
-; SI: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32:
-; SI: s_load_dword [[SPTR:s[0-9]+]],
-; SI: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
-; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; SI: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] [M0]
-; SI: s_endpgm
+; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
+; GCN: s_endpgm
 define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32_offset:
-; SI: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
 }
 
 ; XXX - Is it really necessary to load 4 into VGPR?
 ; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32:
-; SI: s_load_dword [[SPTR:s[0-9]+]],
-; SI: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
-; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; SI: ds_add_u32 [[VPTR]], [[DATA]] [M0]
-; SI: s_endpgm
+; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; GCN: ds_add_u32 [[VPTR]], [[DATA]]
+; GCN: s_endpgm
 define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_offset:
-; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_bad_si_offset
-; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} [M0]
-; CI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 [M0]
-; SI: s_endpgm
+; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}}
+; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 %add
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32:
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; SI: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] [M0]
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]]
+; GCN: s_endpgm
 define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset:
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; SI: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
+; GCN: s_endpgm
 define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_bad_si_offset:
 ; SI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}}
-; CI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; CIVI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_inc_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 %add
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32:
-; SI: ds_sub_u32
-; SI: s_endpgm
+; GCN: ds_sub_u32
+; GCN: s_endpgm
 define void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32_offset:
-; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32:
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; SI: ds_dec_u32  v{{[0-9]+}}, [[NEGONE]]
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_dec_u32  v{{[0-9]+}}, [[NEGONE]]
+; GCN: s_endpgm
 define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; SI: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
+; GCN: s_endpgm
 define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32:
-; SI: ds_and_b32
-; SI: s_endpgm
+; GCN: ds_and_b32
+; GCN: s_endpgm
 define void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32_offset:
-; SI: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32:
-; SI: ds_or_b32
-; SI: s_endpgm
+; GCN: ds_or_b32
+; GCN: s_endpgm
 define void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32_offset:
-; SI: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32:
-; SI: ds_xor_b32
-; SI: s_endpgm
+; GCN: ds_xor_b32
+; GCN: s_endpgm
 define void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32_offset:
-; SI: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
 }
@@ -482,69 +483,69 @@ define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 ; }
 
 ; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32:
-; SI: ds_min_i32
-; SI: s_endpgm
+; GCN: ds_min_i32
+; GCN: s_endpgm
 define void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32_offset:
-; SI: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32:
-; SI: ds_max_i32
-; SI: s_endpgm
+; GCN: ds_max_i32
+; GCN: s_endpgm
 define void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32_offset:
-; SI: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32:
-; SI: ds_min_u32
-; SI: s_endpgm
+; GCN: ds_min_u32
+; GCN: s_endpgm
 define void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32_offset:
-; SI: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32:
-; SI: ds_max_u32
-; SI: s_endpgm
+; GCN: ds_max_u32
+; GCN: s_endpgm
 define void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32_offset:
-; SI: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; SI: s_endpgm
+; GCN: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
 define void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
 }
diff --git a/test/CodeGen/R600/local-atomics64.ll b/test/CodeGen/R600/local-atomics64.ll
index ce6ddbd66265..0ffa5e751b7d 100644
--- a/test/CodeGen/R600/local-atomics64.ll
+++ b/test/CodeGen/R600/local-atomics64.ll
@@ -1,8 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s
 
 ; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64:
-; SI: ds_wrxchg_rtn_b64
-; SI: s_endpgm
+; GCN: ds_wrxchg_rtn_b64
+; GCN: s_endpgm
 define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -10,18 +11,18 @@ define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset:
-; SI: ds_wrxchg_rtn_b64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64:
-; SI: ds_add_rtn_u64
-; SI: s_endpgm
+; GCN: ds_add_rtn_u64
+; GCN: s_endpgm
 define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -29,26 +30,27 @@ define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64_offset:
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
 ; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
-; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
-; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; SI: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 [M0]
-; SI: buffer_store_dwordx2 [[RESULT]],
-; SI: s_endpgm
+; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
+; GCN: buffer_store_dwordx2 [[RESULT]],
+; GCN: s_endpgm
 define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i64 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4
   %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64:
-; SI: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
-; SI: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
-; SI: ds_inc_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
-; SI: buffer_store_dwordx2 [[RESULT]],
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
+; GCN: ds_inc_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; GCN: buffer_store_dwordx2 [[RESULT]],
+; GCN: s_endpgm
 define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -56,18 +58,18 @@ define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64_offset:
-; SI: ds_inc_rtn_u64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_inc_rtn_u64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64:
-; SI: ds_sub_rtn_u64
-; SI: s_endpgm
+; GCN: ds_sub_rtn_u64
+; GCN: s_endpgm
 define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -75,21 +77,21 @@ define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64_offset:
-; SI: ds_sub_rtn_u64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_sub_rtn_u64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64:
-; SI: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
-; SI: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
-; SI: ds_dec_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
-; SI: buffer_store_dwordx2 [[RESULT]],
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
+; GCN: ds_dec_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; GCN: buffer_store_dwordx2 [[RESULT]],
+; GCN: s_endpgm
 define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -97,18 +99,18 @@ define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64_offset:
-; SI: ds_dec_rtn_u64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_dec_rtn_u64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64:
-; SI: ds_and_rtn_b64
-; SI: s_endpgm
+; GCN: ds_and_rtn_b64
+; GCN: s_endpgm
 define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -116,18 +118,18 @@ define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64_offset:
-; SI: ds_and_rtn_b64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_and_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64:
-; SI: ds_or_rtn_b64
-; SI: s_endpgm
+; GCN: ds_or_rtn_b64
+; GCN: s_endpgm
 define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -135,18 +137,18 @@ define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %pt
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64_offset:
-; SI: ds_or_rtn_b64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_or_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64:
-; SI: ds_xor_rtn_b64
-; SI: s_endpgm
+; GCN: ds_xor_rtn_b64
+; GCN: s_endpgm
 define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -154,10 +156,10 @@ define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64_offset:
-; SI: ds_xor_rtn_b64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_xor_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -172,8 +174,8 @@ define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
 ; }
 
 ; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64:
-; SI: ds_min_rtn_i64
-; SI: s_endpgm
+; GCN: ds_min_rtn_i64
+; GCN: s_endpgm
 define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -181,18 +183,18 @@ define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64_offset:
-; SI: ds_min_rtn_i64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_min_rtn_i64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64:
-; SI: ds_max_rtn_i64
-; SI: s_endpgm
+; GCN: ds_max_rtn_i64
+; GCN: s_endpgm
 define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -200,18 +202,18 @@ define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64_offset:
-; SI: ds_max_rtn_i64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_max_rtn_i64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64:
-; SI: ds_min_rtn_u64
-; SI: s_endpgm
+; GCN: ds_min_rtn_u64
+; GCN: s_endpgm
 define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -219,18 +221,18 @@ define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64_offset:
-; SI: ds_min_rtn_u64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_min_rtn_u64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64:
-; SI: ds_max_rtn_u64
-; SI: s_endpgm
+; GCN: ds_max_rtn_u64
+; GCN: s_endpgm
 define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -238,35 +240,35 @@ define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64_offset:
-; SI: ds_max_rtn_u64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_max_rtn_u64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64:
-; SI: ds_wrxchg_rtn_b64
-; SI: s_endpgm
+; GCN: ds_wrxchg_rtn_b64
+; GCN: s_endpgm
 define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset:
-; SI: ds_wrxchg_rtn_b64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64:
-; SI: ds_add_u64
-; SI: s_endpgm
+; GCN: ds_add_u64
+; GCN: s_endpgm
 define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
@@ -274,119 +276,120 @@ define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 
 ; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64_offset:
 ; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
-; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; SI: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
-; SI: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
-; SI: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 [M0]
-; SI: s_endpgm
+; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i64 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4
   %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64:
-; SI: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
-; SI: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
-; SI: ds_inc_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
+; GCN: ds_inc_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; GCN: s_endpgm
 define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64_offset:
-; SI: ds_inc_u64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_inc_u64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64:
-; SI: ds_sub_u64
-; SI: s_endpgm
+; GCN: ds_sub_u64
+; GCN: s_endpgm
 define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64_offset:
-; SI: ds_sub_u64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_sub_u64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64:
-; SI: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
-; SI: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
-; SI: ds_dec_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
+; GCN: ds_dec_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; GCN: s_endpgm
 define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64_offset:
-; SI: ds_dec_u64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_dec_u64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64:
-; SI: ds_and_b64
-; SI: s_endpgm
+; GCN: ds_and_b64
+; GCN: s_endpgm
 define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64_offset:
-; SI: ds_and_b64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_and_b64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_or_noret_i64:
-; SI: ds_or_b64
-; SI: s_endpgm
+; GCN: ds_or_b64
+; GCN: s_endpgm
 define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_or_noret_i64_offset:
-; SI: ds_or_b64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_or_b64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64:
-; SI: ds_xor_b64
-; SI: s_endpgm
+; GCN: ds_xor_b64
+; GCN: s_endpgm
 define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64_offset:
-; SI: ds_xor_b64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_xor_b64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
 }
@@ -399,69 +402,69 @@ define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 ; }
 
 ; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64:
-; SI: ds_min_i64
-; SI: s_endpgm
+; GCN: ds_min_i64
+; GCN: s_endpgm
 define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64_offset:
-; SI: ds_min_i64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_min_i64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64:
-; SI: ds_max_i64
-; SI: s_endpgm
+; GCN: ds_max_i64
+; GCN: s_endpgm
 define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64_offset:
-; SI: ds_max_i64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_max_i64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64:
-; SI: ds_min_u64
-; SI: s_endpgm
+; GCN: ds_min_u64
+; GCN: s_endpgm
 define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64_offset:
-; SI: ds_min_u64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_min_u64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64:
-; SI: ds_max_u64
-; SI: s_endpgm
+; GCN: ds_max_u64
+; GCN: s_endpgm
 define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64_offset:
-; SI: ds_max_u64 {{.*}} offset:32
-; SI: s_endpgm
+; GCN: ds_max_u64 {{.*}} offset:32
+; GCN: s_endpgm
 define void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
 }
diff --git a/test/CodeGen/R600/local-memory-two-objects.ll b/test/CodeGen/R600/local-memory-two-objects.ll
index 60f1a0a4963a..06a8b1246e63 100644
--- a/test/CodeGen/R600/local-memory-two-objects.ll
+++ b/test/CodeGen/R600/local-memory-two-objects.ll
@@ -5,7 +5,6 @@
 @local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
 @local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
 
-; EG: {{^}}local_memory_two_objects:
 
 ; Check that the LDS size emitted correctly
 ; EG: .long 166120
@@ -13,6 +12,8 @@
 ; GCN: .long 47180
 ; GCN-NEXT: .long 38792
 
+; EG: {{^}}local_memory_two_objects:
+
 ; We would like to check the the lds writes are using different
 ; addresses, but due to variations in the scheduler, we can't do
 ; this consistently on evergreen GPUs.
@@ -30,28 +31,28 @@
 ; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
 ; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
 ; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], 16, v{{[0-9]+}}
-; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]] [M0]
-; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16 [M0]
-; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]] [M0]
+; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]]
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]]
 
 define void @local_memory_two_objects(i32 addrspace(1)* %out) {
 entry:
   %x.i = call i32 @llvm.r600.read.tidig.x() #0
-  %arrayidx = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
   store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
   %mul = shl nsw i32 %x.i, 1
-  %arrayidx1 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
+  %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
   store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
   %sub = sub nsw i32 3, %x.i
   call void @llvm.AMDGPU.barrier.local()
-  %arrayidx2 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
-  %0 = load i32 addrspace(3)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds i32 addrspace(1)* %out, i32 %x.i
+  %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
+  %0 = load i32, i32 addrspace(3)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
   store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
-  %arrayidx4 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
-  %1 = load i32 addrspace(3)* %arrayidx4, align 4
+  %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
+  %1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
   %add = add nsw i32 %x.i, 4
-  %arrayidx5 = getelementptr inbounds i32 addrspace(1)* %out, i32 %add
+  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
   store i32 %1, i32 addrspace(1)* %arrayidx5, align 4
   ret void
 }
diff --git a/test/CodeGen/R600/local-memory.ll b/test/CodeGen/R600/local-memory.ll
index 68e72c556f66..9494ed75bd0c 100644
--- a/test/CodeGen/R600/local-memory.ll
+++ b/test/CodeGen/R600/local-memory.ll
@@ -4,7 +4,6 @@
 
 @local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
 
-; FUNC-LABEL: {{^}}local_memory:
 
 ; Check that the LDS size emitted correctly
 ; EG: .long 166120
@@ -14,6 +13,8 @@
 ; CI: .long 47180
 ; CI-NEXT: .long 38792
 
+; FUNC-LABEL: {{^}}local_memory:
+
 ; EG: LDS_WRITE
 ; SI-NOT: s_wqm_b64
 ; SI: ds_write_b32
@@ -29,15 +30,15 @@
 define void @local_memory(i32 addrspace(1)* %out) {
 entry:
   %y.i = call i32 @llvm.r600.read.tidig.x() #0
-  %arrayidx = getelementptr inbounds [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
+  %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
   store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
   %add = add nsw i32 %y.i, 1
   %cmp = icmp eq i32 %add, 16
   %.add = select i1 %cmp, i32 0, i32 %add
   call void @llvm.AMDGPU.barrier.local()
-  %arrayidx1 = getelementptr inbounds [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
-  %0 = load i32 addrspace(3)* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %out, i32 %y.i
+  %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
+  %0 = load i32, i32 addrspace(3)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
   store i32 %0, i32 addrspace(1)* %arrayidx2, align 4
   ret void
 }
diff --git a/test/CodeGen/R600/loop-address.ll b/test/CodeGen/R600/loop-address.ll
index 03e0f011fffc..7fadb8dba7b8 100644
--- a/test/CodeGen/R600/loop-address.ll
+++ b/test/CodeGen/R600/loop-address.ll
@@ -17,7 +17,7 @@ for.body:                                         ; preds = %for.body, %entry
   %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ]
   %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ]
   %i.07 = add nsw i32 %i.07.in, -1
-  %arrayidx = getelementptr inbounds i32 addrspace(1)* %out, i32 %ai.06
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %ai.06
   store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4
   %add = add nsw i32 %ai.06, 1
   %exitcond = icmp eq i32 %add, %iterations
diff --git a/test/CodeGen/R600/loop-idiom.ll b/test/CodeGen/R600/loop-idiom.ll
index a0b00ab644b4..810b34fed865 100644
--- a/test/CodeGen/R600/loop-idiom.ll
+++ b/test/CodeGen/R600/loop-idiom.ll
@@ -20,9 +20,9 @@ entry:
 
 for.body:
   %0 = phi i32 [0, %entry], [%4, %for.body]
-  %1 = getelementptr i8 addrspace(3)* %in, i32 %0
-  %2 = getelementptr i8* %dest, i32 %0
-  %3 = load i8 addrspace(3)* %1
+  %1 = getelementptr i8, i8 addrspace(3)* %in, i32 %0
+  %2 = getelementptr i8, i8* %dest, i32 %0
+  %3 = load i8, i8 addrspace(3)* %1
   store i8 %3, i8* %2
   %4 = add i32 %0, 1
   %5 = icmp eq i32 %4, %size
@@ -44,7 +44,7 @@ entry:
 
 for.body:
   %0 = phi i32 [0, %entry], [%2, %for.body]
-  %1 = getelementptr i8* %dest, i32 %0
+  %1 = getelementptr i8, i8* %dest, i32 %0
   store i8 0, i8* %1
   %2 = add i32 %0, 1
   %3 = icmp eq i32 %2, %size
diff --git a/test/CodeGen/R600/m0-spill.ll b/test/CodeGen/R600/m0-spill.ll
index 4dade82325ce..1dddc85f775d 100644
--- a/test/CodeGen/R600/m0-spill.ll
+++ b/test/CodeGen/R600/m0-spill.ll
@@ -12,8 +12,8 @@ main_body:
   br i1 %cmp, label %if, label %else
 
 if:
-  %lds_ptr = getelementptr [64 x float] addrspace(3)* @lds, i32 0, i32 0
-  %lds_data = load float addrspace(3)* %lds_ptr
+  %lds_ptr = getelementptr [64 x float], [64 x float] addrspace(3)* @lds, i32 0, i32 0
+  %lds_data = load float, float addrspace(3)* %lds_ptr
   br label %endif
 
 else:
diff --git a/test/CodeGen/R600/mad-combine.ll b/test/CodeGen/R600/mad-combine.ll
new file mode 100644
index 000000000000..bc071628ead0
--- /dev/null
+++ b/test/CodeGen/R600/mad-combine.ll
@@ -0,0 +1,567 @@
+; Make sure we still form mad even when unsafe math or fp-contract is allowed instead of fma.
+
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
+
+; Make sure we don't form mad with denormals
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare float @llvm.fabs.f32(float) #0
+declare float @llvm.fma.f32(float, float, float) #0
+declare float @llvm.fmuladd.f32(float, float, float) #0
+
+; (fadd (fmul x, y), z) -> (fma x, y, z)
+; FUNC-LABEL: {{^}}combine_to_mad_f32_0:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+
+; SI-DENORM-SLOWFMAF-NOT: v_fma
+; SI-DENORM-SLOWFMAF-NOT: v_mad
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %a = load float, float addrspace(1)* %gep.0
+  %b = load float, float addrspace(1)* %gep.1
+  %c = load float, float addrspace(1)* %gep.2
+
+  %mul = fmul float %a, %b
+  %fma = fadd float %mul, %c
+  store float %fma, float addrspace(1)* %gep.out
+  ret void
+}
+
+; (fadd (fmul x, y), z) -> (fma x, y, z)
+; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
+
+  %a = load float, float addrspace(1)* %gep.0
+  %b = load float, float addrspace(1)* %gep.1
+  %c = load float, float addrspace(1)* %gep.2
+  %d = load float, float addrspace(1)* %gep.3
+
+  %mul = fmul float %a, %b
+  %fma0 = fadd float %mul, %c
+  %fma1 = fadd float %mul, %d
+
+  store float %fma0, float addrspace(1)* %gep.out.0
+  store float %fma1, float addrspace(1)* %gep.out.1
+  ret void
+}
+
+; (fadd x, (fmul y, z)) -> (fma y, z, x)
+; FUNC-LABEL: {{^}}combine_to_mad_f32_1:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %a = load float, float addrspace(1)* %gep.0
+  %b = load float, float addrspace(1)* %gep.1
+  %c = load float, float addrspace(1)* %gep.2
+
+  %mul = fmul float %a, %b
+  %fma = fadd float %c, %mul
+  store float %fma, float addrspace(1)* %gep.out
+  ret void
+}
+
+; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %a = load float, float addrspace(1)* %gep.0
+  %b = load float, float addrspace(1)* %gep.1
+  %c = load float, float addrspace(1)* %gep.2
+
+  %mul = fmul float %a, %b
+  %fma = fsub float %mul, %c
+  store float %fma, float addrspace(1)* %gep.out
+  ret void
+}
+
+; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
+
+  %a = load float, float addrspace(1)* %gep.0
+  %b = load float, float addrspace(1)* %gep.1
+  %c = load float, float addrspace(1)* %gep.2
+  %d = load float, float addrspace(1)* %gep.3
+
+  %mul = fmul float %a, %b
+  %fma0 = fsub float %mul, %c
+  %fma1 = fsub float %mul, %d
+  store float %fma0, float addrspace(1)* %gep.out.0
+  store float %fma1, float addrspace(1)* %gep.out.1
+  ret void
+}
+
+; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %a = load float, float addrspace(1)* %gep.0
+  %b = load float, float addrspace(1)* %gep.1
+  %c = load float, float addrspace(1)* %gep.2
+
+  %mul = fmul float %a, %b
+  %fma = fsub float %c, %mul
+  store float %fma, float addrspace(1)* %gep.out
+  ret void
+}
+
+; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
+
+  %a = load float, float addrspace(1)* %gep.0
+  %b = load float, float addrspace(1)* %gep.1
+  %c = load float, float addrspace(1)* %gep.2
+  %d = load float, float addrspace(1)* %gep.3
+
+  %mul = fmul float %a, %b
+  %fma0 = fsub float %c, %mul
+  %fma1 = fsub float %d, %mul
+  store float %fma0, float addrspace(1)* %gep.out.0
+  store float %fma1, float addrspace(1)* %gep.out.1
+  ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
+
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[TMP]], [[C]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %a = load float, float addrspace(1)* %gep.0
+  %b = load float, float addrspace(1)* %gep.1
+  %c = load float, float addrspace(1)* %gep.2
+
+  %mul = fmul float %a, %b
+  %mul.neg = fsub float -0.0, %mul
+  %fma = fsub float %mul.neg, %c
+
+  store float %fma, float addrspace(1)* %gep.out
+  ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT1:v[0-9]+]], -[[TMP]], [[D]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
+
+  %a = load float, float addrspace(1)* %gep.0
+  %b = load float, float addrspace(1)* %gep.1
+  %c = load float, float addrspace(1)* %gep.2
+  %d = load float, float addrspace(1)* %gep.3
+
+  %mul = fmul float %a, %b
+  %mul.neg = fsub float -0.0, %mul
+  %fma0 = fsub float %mul.neg, %c
+  %fma1 = fsub float %mul.neg, %d
+
+  store float %fma0, float addrspace(1)* %gep.out.0
+  store float %fma1, float addrspace(1)* %gep.out.1
+  ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
+
+  %a = load float, float addrspace(1)* %gep.0
+  %b = load float, float addrspace(1)* %gep.1
+  %c = load float, float addrspace(1)* %gep.2
+  %d = load float, float addrspace(1)* %gep.3
+
+  %mul = fmul float %a, %b
+  %mul.neg = fsub float -0.0, %mul
+  %fma0 = fsub float %mul.neg, %c
+  %fma1 = fsub float %mul, %d
+
+  store float %fma0, float addrspace(1)* %gep.out.0
+  store float %fma1, float addrspace(1)* %gep.out.1
+  ret void
+}
+
+; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+
+; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
+; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]
+
+; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], [[D]], [[E]], -[[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP0]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[C]], [[TMP1]]
+
+; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %x = load float, float addrspace(1)* %gep.0
+  %y = load float, float addrspace(1)* %gep.1
+  %z = load float, float addrspace(1)* %gep.2
+  %u = load float, float addrspace(1)* %gep.3
+  %v = load float, float addrspace(1)* %gep.4
+
+  %tmp0 = fmul float %u, %v
+  %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
+  %tmp2 = fsub float %tmp1, %z
+
+  store float %tmp2, float addrspace(1)* %gep.out
+  ret void
+}
+
+; fold (fsub x, (fma y, z, (fmul u, v)))
+;   -> (fma (fneg y), z, (fma (fneg u), v, x))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+
+; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
+; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
+
+; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], -[[D]], [[E]], [[A]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP0]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
+
+; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: s_endpgm
+define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %x = load float, float addrspace(1)* %gep.0
+  %y = load float, float addrspace(1)* %gep.1
+  %z = load float, float addrspace(1)* %gep.2
+  %u = load float, float addrspace(1)* %gep.3
+  %v = load float, float addrspace(1)* %gep.4
+
+  %tmp0 = fmul float %u, %v
+  %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0
+  %tmp2 = fsub float %x, %tmp1
+
+  store float %tmp2, float addrspace(1)* %gep.out
+  ret void
+}
+
+; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+
+; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]]
+
+; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]]
+
+; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: s_endpgm
+define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %x = load float, float addrspace(1)* %gep.0
+  %y = load float, float addrspace(1)* %gep.1
+  %z = load float, float addrspace(1)* %gep.2
+  %u = load float, float addrspace(1)* %gep.3
+  %v = load float, float addrspace(1)* %gep.4
+
+  %tmp0 = fmul float %u, %v
+  %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
+  %tmp2 = fsub float %tmp1, %z
+
+  store float %tmp2, float addrspace(1)* %gep.out
+  ret void
+}
+
+; fold (fsub x, (fmuladd y, z, (fmul u, v)))
+;   -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+
+; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]
+
+; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]]
+
+; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: s_endpgm
+define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %x = load float, float addrspace(1)* %gep.0
+  %y = load float, float addrspace(1)* %gep.1
+  %z = load float, float addrspace(1)* %gep.2
+  %u = load float, float addrspace(1)* %gep.3
+  %v = load float, float addrspace(1)* %gep.4
+
+  %tmp0 = fmul float %u, %v
+  %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
+  %tmp2 = fsub float %x, %tmp1
+
+  store float %tmp2, float addrspace(1)* %gep.out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/R600/mad-sub.ll b/test/CodeGen/R600/mad-sub.ll
index 7b4020d2973c..aa4194ff6106 100644
--- a/test/CodeGen/R600/mad-sub.ll
+++ b/test/CodeGen/R600/mad-sub.ll
@@ -12,15 +12,15 @@ declare float @llvm.fabs.f32(float) #0
 define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
   %tid.ext = sext i32 %tid to i64
-  %gep0 = getelementptr float addrspace(1)* %ptr, i64 %tid.ext
+  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
   %add1 = add i64 %tid.ext, 1
-  %gep1 = getelementptr float addrspace(1)* %ptr, i64 %add1
+  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
   %add2 = add i64 %tid.ext, 2
-  %gep2 = getelementptr float addrspace(1)* %ptr, i64 %add2
-  %outgep = getelementptr float addrspace(1)* %out, i64 %tid.ext
-  %a = load float addrspace(1)* %gep0, align 4
-  %b = load float addrspace(1)* %gep1, align 4
-  %c = load float addrspace(1)* %gep2, align 4
+  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
+  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load float, float addrspace(1)* %gep0, align 4
+  %b = load float, float addrspace(1)* %gep1, align 4
+  %c = load float, float addrspace(1)* %gep2, align 4
   %mul = fmul float %a, %b
   %sub = fsub float %mul, %c
   store float %sub, float addrspace(1)* %outgep, align 4
@@ -36,15 +36,15 @@ define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrs
 define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
   %tid.ext = sext i32 %tid to i64
-  %gep0 = getelementptr float addrspace(1)* %ptr, i64 %tid.ext
+  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
   %add1 = add i64 %tid.ext, 1
-  %gep1 = getelementptr float addrspace(1)* %ptr, i64 %add1
+  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
   %add2 = add i64 %tid.ext, 2
-  %gep2 = getelementptr float addrspace(1)* %ptr, i64 %add2
-  %outgep = getelementptr float addrspace(1)* %out, i64 %tid.ext
-  %a = load float addrspace(1)* %gep0, align 4
-  %b = load float addrspace(1)* %gep1, align 4
-  %c = load float addrspace(1)* %gep2, align 4
+  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
+  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load float, float addrspace(1)* %gep0, align 4
+  %b = load float, float addrspace(1)* %gep1, align 4
+  %c = load float, float addrspace(1)* %gep2, align 4
   %mul = fmul float %a, %b
   %sub = fsub float %c, %mul
   store float %sub, float addrspace(1)* %outgep, align 4
@@ -57,15 +57,15 @@ define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float a
 define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
   %tid.ext = sext i32 %tid to i64
-  %gep0 = getelementptr double addrspace(1)* %ptr, i64 %tid.ext
+  %gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext
   %add1 = add i64 %tid.ext, 1
-  %gep1 = getelementptr double addrspace(1)* %ptr, i64 %add1
+  %gep1 = getelementptr double, double addrspace(1)* %ptr, i64 %add1
   %add2 = add i64 %tid.ext, 2
-  %gep2 = getelementptr double addrspace(1)* %ptr, i64 %add2
-  %outgep = getelementptr double addrspace(1)* %out, i64 %tid.ext
-  %a = load double addrspace(1)* %gep0, align 8
-  %b = load double addrspace(1)* %gep1, align 8
-  %c = load double addrspace(1)* %gep2, align 8
+  %gep2 = getelementptr double, double addrspace(1)* %ptr, i64 %add2
+  %outgep = getelementptr double, double addrspace(1)* %out, i64 %tid.ext
+  %a = load double, double addrspace(1)* %gep0, align 8
+  %b = load double, double addrspace(1)* %gep1, align 8
+  %c = load double, double addrspace(1)* %gep2, align 8
   %mul = fmul double %a, %b
   %sub = fsub double %mul, %c
   store double %sub, double addrspace(1)* %outgep, align 8
@@ -81,15 +81,15 @@ define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double add
 define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
   %tid.ext = sext i32 %tid to i64
-  %gep0 = getelementptr float addrspace(1)* %ptr, i64 %tid.ext
+  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
   %add1 = add i64 %tid.ext, 1
-  %gep1 = getelementptr float addrspace(1)* %ptr, i64 %add1
+  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
   %add2 = add i64 %tid.ext, 2
-  %gep2 = getelementptr float addrspace(1)* %ptr, i64 %add2
-  %outgep = getelementptr float addrspace(1)* %out, i64 %tid.ext
-  %a = load float addrspace(1)* %gep0, align 4
-  %b = load float addrspace(1)* %gep1, align 4
-  %c = load float addrspace(1)* %gep2, align 4
+  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
+  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load float, float addrspace(1)* %gep0, align 4
+  %b = load float, float addrspace(1)* %gep1, align 4
+  %c = load float, float addrspace(1)* %gep2, align 4
   %c.abs = call float @llvm.fabs.f32(float %c) #0
   %mul = fmul float %a, %b
   %sub = fsub float %mul, %c.abs
@@ -106,15 +106,15 @@ define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float
 define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
   %tid.ext = sext i32 %tid to i64
-  %gep0 = getelementptr float addrspace(1)* %ptr, i64 %tid.ext
+  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
   %add1 = add i64 %tid.ext, 1
-  %gep1 = getelementptr float addrspace(1)* %ptr, i64 %add1
+  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
   %add2 = add i64 %tid.ext, 2
-  %gep2 = getelementptr float addrspace(1)* %ptr, i64 %add2
-  %outgep = getelementptr float addrspace(1)* %out, i64 %tid.ext
-  %a = load float addrspace(1)* %gep0, align 4
-  %b = load float addrspace(1)* %gep1, align 4
-  %c = load float addrspace(1)* %gep2, align 4
+  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
+  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load float, float addrspace(1)* %gep0, align 4
+  %b = load float, float addrspace(1)* %gep1, align 4
+  %c = load float, float addrspace(1)* %gep2, align 4
   %c.abs = call float @llvm.fabs.f32(float %c) #0
   %mul = fmul float %a, %b
   %sub = fsub float %c.abs, %mul
@@ -127,15 +127,15 @@ define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, fl
 define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
   %tid.ext = sext i32 %tid to i64
-  %gep0 = getelementptr float addrspace(1)* %ptr, i64 %tid.ext
+  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
   %add1 = add i64 %tid.ext, 1
-  %gep1 = getelementptr float addrspace(1)* %ptr, i64 %add1
+  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
   %add2 = add i64 %tid.ext, 2
-  %gep2 = getelementptr float addrspace(1)* %ptr, i64 %add2
-  %outgep = getelementptr float addrspace(1)* %out, i64 %tid.ext
-  %a = load float addrspace(1)* %gep0, align 4
-  %b = load float addrspace(1)* %gep1, align 4
-  %c = load float addrspace(1)* %gep2, align 4
+  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
+  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load float, float addrspace(1)* %gep0, align 4
+  %b = load float, float addrspace(1)* %gep1, align 4
+  %c = load float, float addrspace(1)* %gep2, align 4
   %nega = fsub float -0.000000e+00, %a
   %negb = fsub float -0.000000e+00, %b
   %mul = fmul float %nega, %negb
@@ -153,15 +153,15 @@ define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float a
 define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.r600.read.tidig.x() #0
   %tid.ext = sext i32 %tid to i64
-  %gep0 = getelementptr float addrspace(1)* %ptr, i64 %tid.ext
+  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
   %add1 = add i64 %tid.ext, 1
-  %gep1 = getelementptr float addrspace(1)* %ptr, i64 %add1
+  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
   %add2 = add i64 %tid.ext, 2
-  %gep2 = getelementptr float addrspace(1)* %ptr, i64 %add2
-  %outgep = getelementptr float addrspace(1)* %out, i64 %tid.ext
-  %a = load float addrspace(1)* %gep0, align 4
-  %b = load float addrspace(1)* %gep1, align 4
-  %c = load float addrspace(1)* %gep2, align 4
+  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
+  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load float, float addrspace(1)* %gep0, align 4
+  %b = load float, float addrspace(1)* %gep1, align 4
+  %c = load float, float addrspace(1)* %gep2, align 4
   %b.abs = call float @llvm.fabs.f32(float %b) #0
   %mul = fmul float %a, %b.abs
   %sub = fsub float %mul, %c
@@ -176,12 +176,12 @@ define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float
 ; SI: buffer_store_dword [[RESULT]]
 define void @fsub_c_fadd_a_a(float addrspace(1)* %out, float addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
 
-  %r1 = load float addrspace(1)* %gep.0
-  %r2 = load float addrspace(1)* %gep.1
+  %r1 = load float, float addrspace(1)* %gep.0
+  %r2 = load float, float addrspace(1)* %gep.1
 
   %add = fadd float %r1, %r1
   %r3 = fsub float %r2, %add
@@ -197,12 +197,12 @@ define void @fsub_c_fadd_a_a(float addrspace(1)* %out, float addrspace(1)* %in)
 ; SI: buffer_store_dword [[RESULT]]
 define void @fsub_fadd_a_a_c(float addrspace(1)* %out, float addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
 
-  %r1 = load float addrspace(1)* %gep.0
-  %r2 = load float addrspace(1)* %gep.1
+  %r1 = load float, float addrspace(1)* %gep.0
+  %r2 = load float, float addrspace(1)* %gep.1
 
   %add = fadd float %r1, %r1
   %r3 = fsub float %add, %r2
diff --git a/test/CodeGen/R600/madak.ll b/test/CodeGen/R600/madak.ll
new file mode 100644
index 000000000000..933bb016d2c9
--- /dev/null
+++ b/test/CodeGen/R600/madak.ll
@@ -0,0 +1,193 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+
+; FIXME: Enable VI
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare float @llvm.fabs.f32(float) nounwind readnone
+
+; GCN-LABEL: {{^}}madak_f32:
+; GCN: buffer_load_dword [[VA:v[0-9]+]]
+; GCN: buffer_load_dword [[VB:v[0-9]+]]
+; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VB]], [[VA]], 0x41200000
+define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
+  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %a = load float, float addrspace(1)* %in.a.gep, align 4
+  %b = load float, float addrspace(1)* %in.b.gep, align 4
+
+  %mul = fmul float %a, %b
+  %madak = fadd float %mul, 10.0
+  store float %madak, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; Make sure this is only folded with one use. This is a code size
+; optimization and if we fold the immediate multiple times, we'll undo
+; it.
+
+; GCN-LABEL: {{^}}madak_2_use_f32:
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
+; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], [[VK]]
+; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VC]], [[VK]]
+; GCN: s_endpgm
+define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
+  %in.gep.2 = getelementptr float, float addrspace(1)* %in.gep.0, i32 2
+
+  %out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
+
+  %a = load float, float addrspace(1)* %in.gep.0, align 4
+  %b = load float, float addrspace(1)* %in.gep.1, align 4
+  %c = load float, float addrspace(1)* %in.gep.2, align 4
+
+  %mul0 = fmul float %a, %b
+  %mul1 = fmul float %a, %c
+  %madak0 = fadd float %mul0, 10.0
+  %madak1 = fadd float %mul1, 10.0
+
+  store float %madak0, float addrspace(1)* %out.gep.0, align 4
+  store float %madak1, float addrspace(1)* %out.gep.1, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}madak_m_inline_imm_f32:
+; GCN: buffer_load_dword [[VA:v[0-9]+]]
+; GCN: v_madak_f32_e32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
+define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %a = load float, float addrspace(1)* %in.a.gep, align 4
+
+  %mul = fmul float 4.0, %a
+  %madak = fadd float %mul, 10.0
+  store float %madak, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; Make sure nothing weird happens with a value that is also allowed as
+; an inline immediate.
+
+; GCN-LABEL: {{^}}madak_inline_imm_f32:
+; GCN: buffer_load_dword [[VA:v[0-9]+]]
+; GCN: buffer_load_dword [[VB:v[0-9]+]]
+; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
+define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
+  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %a = load float, float addrspace(1)* %in.a.gep, align 4
+  %b = load float, float addrspace(1)* %in.b.gep, align 4
+
+  %mul = fmul float %a, %b
+  %madak = fadd float %mul, 4.0
+  store float %madak, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; We can't use an SGPR when forming madak
+; GCN-LABEL: {{^}}s_v_madak_f32:
+; GCN: s_load_dword [[SB:s[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]
+; GCN-NOT: v_madak_f32
+; GCN: v_mad_f32 {{v[0-9]+}}, [[SB]], [[VA]], [[VK]]
+define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %a = load float, float addrspace(1)* %in.a.gep, align 4
+
+  %mul = fmul float %a, %b
+  %madak = fadd float %mul, 10.0
+  store float %madak, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; GCN-LABEL: @v_s_madak_f32
+; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]
+; GCN-NOT: v_madak_f32
+; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[SB]], [[VK]]
+define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %b = load float, float addrspace(1)* %in.b.gep, align 4
+
+  %mul = fmul float %a, %b
+  %madak = fadd float %mul, 10.0
+  store float %madak, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_s_madak_f32:
+; GCN-NOT: v_madak_f32
+; GCN: v_mad_f32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
+  %mul = fmul float %a, %b
+  %madak = fadd float %mul, 10.0
+  store float %madak, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_madak_src0_modifier_f32:
+; GCN: buffer_load_dword [[VA:v[0-9]+]]
+; GCN: buffer_load_dword [[VB:v[0-9]+]]
+; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
+; GCN: s_endpgm
+define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
+  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %a = load float, float addrspace(1)* %in.a.gep, align 4
+  %b = load float, float addrspace(1)* %in.b.gep, align 4
+
+  %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone
+
+  %mul = fmul float %a.fabs, %b
+  %madak = fadd float %mul, 10.0
+  store float %madak, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_madak_src1_modifier_f32:
+; GCN: buffer_load_dword [[VA:v[0-9]+]]
+; GCN: buffer_load_dword [[VB:v[0-9]+]]
+; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
+; GCN: s_endpgm
+define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
+  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %a = load float, float addrspace(1)* %in.a.gep, align 4
+  %b = load float, float addrspace(1)* %in.b.gep, align 4
+
+  %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone
+
+  %mul = fmul float %a, %b.fabs
+  %madak = fadd float %mul, 10.0
+  store float %madak, float addrspace(1)* %out.gep, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/madmk.ll b/test/CodeGen/R600/madmk.ll
new file mode 100644
index 000000000000..ba7bb221a99a
--- /dev/null
+++ b/test/CodeGen/R600/madmk.ll
@@ -0,0 +1,205 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare float @llvm.fabs.f32(float) nounwind readnone
+
+; GCN-LABEL: {{^}}madmk_f32:
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: v_madmk_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
+define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
+
+  %mul = fmul float %a, 10.0
+  %madmk = fadd float %mul, %b
+  store float %madmk, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}madmk_2_use_f32:
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
+; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VB]]
+; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VC]]
+; GCN: s_endpgm
+define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
+  %in.gep.2 = getelementptr float, float addrspace(1)* %in.gep.0, i32 2
+
+  %out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
+
+  %a = load float, float addrspace(1)* %in.gep.0, align 4
+  %b = load float, float addrspace(1)* %in.gep.1, align 4
+  %c = load float, float addrspace(1)* %in.gep.2, align 4
+
+  %mul0 = fmul float %a, 10.0
+  %mul1 = fmul float %a, 10.0
+  %madmk0 = fadd float %mul0, %b
+  %madmk1 = fadd float %mul1, %c
+
+  store float %madmk0, float addrspace(1)* %out.gep.0, align 4
+  store float %madmk1, float addrspace(1)* %out.gep.1, align 4
+  ret void
+}
+
+; We don't get any benefit if the constant is an inline immediate.
+; GCN-LABEL: {{^}}madmk_inline_imm_f32:
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: v_mad_f32 {{v[0-9]+}}, 4.0, [[VA]], [[VB]]
+define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
+
+  %mul = fmul float %a, 4.0
+  %madmk = fadd float %mul, %b
+  store float %madmk, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_s_madmk_f32:
+; GCN-NOT: v_madmk_f32
+; GCN: v_mad_f32
+; GCN: s_endpgm
+define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %mul = fmul float %a, 10.0
+  %madmk = fadd float %mul, %b
+  store float %madmk, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_s_madmk_f32:
+; GCN-NOT: v_madmk_f32
+; GCN: v_mad_f32
+; GCN: s_endpgm
+define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep.0, align 4
+
+  %mul = fmul float %a, 10.0
+  %madmk = fadd float %mul, %b
+  store float %madmk, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}scalar_vector_madmk_f32:
+; GCN-NOT: v_madmk_f32
+; GCN: v_mad_f32
+; GCN: s_endpgm
+define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %b = load float, float addrspace(1)* %gep.0, align 4
+
+  %mul = fmul float %a, 10.0
+  %madmk = fadd float %mul, %b
+  store float %madmk, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_madmk_src0_modifier_f32:
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
+define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
+
+  %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone
+
+  %mul = fmul float %a.fabs, 10.0
+  %madmk = fadd float %mul, %b
+  store float %madmk, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_madmk_src2_modifier_f32:
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, |{{[sv][0-9]+}}|
+define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
+
+  %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone
+
+  %mul = fmul float %a, 10.0
+  %madmk = fadd float %mul, %b.fabs
+  store float %madmk, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}madmk_add_inline_imm_f32:
+; GCN: buffer_load_dword [[A:v[0-9]+]]
+; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], [[A]], 2.0
+define void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %a = load float, float addrspace(1)* %gep.0, align 4
+
+  %mul = fmul float %a, 10.0
+  %madmk = fadd float %mul, 2.0
+  store float %madmk, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}kill_madmk_verifier_error:
+; SI: s_xor_b64
+; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x472aee8c
+; SI: s_or_b64
+define void @kill_madmk_verifier_error() nounwind {
+bb:
+  br label %bb2
+
+bb1:                                              ; preds = %bb2
+  ret void
+
+bb2:                                              ; preds = %bb6, %bb
+  %tmp = phi float [ undef, %bb ], [ %tmp8, %bb6 ]
+  %tmp3 = fsub float undef, %tmp
+  %tmp5 = fcmp oeq float %tmp3, 1.000000e+04
+  br i1 %tmp5, label %bb1, label %bb6
+
+bb6:                                              ; preds = %bb2
+  %tmp4 = fmul float %tmp, undef
+  %tmp7 = fmul float %tmp4, 0x40E55DD180000000
+  %tmp8 = fadd float %tmp7, undef
+  br label %bb2
+}
diff --git a/test/CodeGen/R600/max.ll b/test/CodeGen/R600/max.ll
index 20af99332453..1aa9e6883011 100644
--- a/test/CodeGen/R600/max.ll
+++ b/test/CodeGen/R600/max.ll
@@ -6,11 +6,11 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 ; SI: v_max_i32_e32
 define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
-  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %a = load i32 addrspace(1)* %gep0, align 4
-  %b = load i32 addrspace(1)* %gep1, align 4
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep0, align 4
+  %b = load i32, i32 addrspace(1)* %gep1, align 4
   %cmp = icmp sge i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %outgep, align 4
@@ -26,15 +26,33 @@ define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_test_imax_sge_imm_i32:
+; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
+define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+  %cmp = icmp sge i32 %a, 9
+  %val = select i1 %cmp, i32 %a, i32 9
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_imax_sgt_imm_i32:
+; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
+define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+  %cmp = icmp sgt i32 %a, 9
+  %val = select i1 %cmp, i32 %a, i32 9
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
 ; FUNC-LABEL: @v_test_imax_sgt_i32
 ; SI: v_max_i32_e32
 define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
-  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %a = load i32 addrspace(1)* %gep0, align 4
-  %b = load i32 addrspace(1)* %gep1, align 4
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep0, align 4
+  %b = load i32, i32 addrspace(1)* %gep1, align 4
   %cmp = icmp sgt i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %outgep, align 4
@@ -54,11 +72,11 @@ define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
 ; SI: v_max_u32_e32
 define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
-  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %a = load i32 addrspace(1)* %gep0, align 4
-  %b = load i32 addrspace(1)* %gep1, align 4
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep0, align 4
+  %b = load i32, i32 addrspace(1)* %gep1, align 4
   %cmp = icmp uge i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %outgep, align 4
@@ -78,11 +96,11 @@ define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
 ; SI: v_max_u32_e32
 define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
-  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %a = load i32 addrspace(1)* %gep0, align 4
-  %b = load i32 addrspace(1)* %gep1, align 4
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep0, align 4
+  %b = load i32, i32 addrspace(1)* %gep1, align 4
   %cmp = icmp ugt i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %outgep, align 4
diff --git a/test/CodeGen/R600/max3.ll b/test/CodeGen/R600/max3.ll
index f905e171b334..cfb94b272e51 100644
--- a/test/CodeGen/R600/max3.ll
+++ b/test/CodeGen/R600/max3.ll
@@ -6,13 +6,13 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 ; SI: v_max3_i32
 define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %a = load i32 addrspace(1)* %gep0, align 4
-  %b = load i32 addrspace(1)* %gep1, align 4
-  %c = load i32 addrspace(1)* %gep2, align 4
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep0, align 4
+  %b = load i32, i32 addrspace(1)* %gep1, align 4
+  %c = load i32, i32 addrspace(1)* %gep2, align 4
   %icmp0 = icmp sgt i32 %a, %b
   %i0 = select i1 %icmp0, i32 %a, i32 %b
   %icmp1 = icmp sgt i32 %i0, %c
@@ -25,13 +25,13 @@ define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %apt
 ; SI: v_max3_u32
 define void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %a = load i32 addrspace(1)* %gep0, align 4
-  %b = load i32 addrspace(1)* %gep1, align 4
-  %c = load i32 addrspace(1)* %gep2, align 4
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep0, align 4
+  %b = load i32, i32 addrspace(1)* %gep1, align 4
+  %c = load i32, i32 addrspace(1)* %gep2, align 4
   %icmp0 = icmp ugt i32 %a, %b
   %i0 = select i1 %icmp0, i32 %a, i32 %b
   %icmp1 = icmp ugt i32 %i0, %c
diff --git a/test/CodeGen/R600/merge-stores.ll b/test/CodeGen/R600/merge-stores.ll
new file mode 100644
index 000000000000..dbf9d4481ffb
--- /dev/null
+++ b/test/CodeGen/R600/merge-stores.ll
@@ -0,0 +1,536 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+
+; Run with devices with different unaligned load restrictions.
+
+; TODO: Vector element tests
+; TODO: Non-zero base offset for load and store combinations
+; TODO: Same base addrspacecasted
+
+
+; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: s_endpgm
+define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+
+  store i8 123, i8 addrspace(1)* %out.gep.1
+  store i8 456, i8 addrspace(1)* %out, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: s_endpgm
+define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+
+  store i8 123, i8 addrspace(1)* %out.gep.1
+  store i8 456, i8 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
+; GCN: buffer_store_dword v
+define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+
+  store i16 123, i16 addrspace(1)* %out.gep.1
+  store i16 456, i16 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
+; GCN: buffer_store_dword v
+define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+
+  store i16 0, i16 addrspace(1)* %out.gep.1
+  store i16 0, i16 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: s_endpgm
+define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+
+  store i16 123, i16 addrspace(1)* %out.gep.1
+  store i16 456, i16 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
+; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8
+; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b
+; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+
+  store i32 123, i32 addrspace(1)* %out.gep.1
+  store i32 456, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
+; GCN: buffer_store_dwordx2
+define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
+  store float 1.0, float addrspace(1)* %out.gep.1.bc
+  store i32 456, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
+; GCN: buffer_store_dwordx2
+define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
+  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
+  store i32 123, i32 addrspace(1)* %out.gep.1.bc
+  store float 4.0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
+; GCN: buffer_store_dwordx4
+define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
+
+  store i32 123, i32 addrspace(1)* %out.gep.1
+  store i32 456, i32 addrspace(1)* %out.gep.2
+  store i32 333, i32 addrspace(1)* %out.gep.3
+  store i32 1234, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
+; XGCN: buffer_store_dwordx4
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dwordx2 v
+define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
+  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
+
+  store float 8.0, float addrspace(1)* %out
+  store float 1.0, float addrspace(1)* %out.gep.1
+  store float 2.0, float addrspace(1)* %out.gep.2
+  store float 4.0, float addrspace(1)* %out.gep.3
+  ret void
+}
+
+; First store is out of order. Because of order of combines, the
+; consecutive store fails because only some of the stores have been
+; replaced with integer constant stores, and then won't merge because
+; the types are different.
+
+; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
+; XGCN: buffer_store_dwordx4
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
+  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
+
+  store float 1.0, float addrspace(1)* %out.gep.1
+  store float 2.0, float addrspace(1)* %out.gep.2
+  store float 4.0, float addrspace(1)* %out.gep.3
+  store float 8.0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dword
+; SI-NOT: buffer_store_dword
+; GCN: s_endpgm
+define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+
+  store i32 123, i32 addrspace(1)* %out.gep.1
+  store i32 456, i32 addrspace(1)* %out.gep.2
+  store i32 1234, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
+; XGCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx2
+; GCN: buffer_store_dwordx2
+define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
+
+  store i64 123, i64 addrspace(1)* %out.gep.1
+  store i64 456, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
+; XGCN: buffer_store_dwordx4
+; XGCN: buffer_store_dwordx4
+
+; GCN: buffer_store_dwordx2
+; GCN: buffer_store_dwordx2
+; GCN: buffer_store_dwordx2
+; GCN: buffer_store_dwordx2
+define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
+  %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
+  %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
+
+  store i64 123, i64 addrspace(1)* %out.gep.1
+  store i64 456, i64 addrspace(1)* %out.gep.2
+  store i64 333, i64 addrspace(1)* %out.gep.3
+  store i64 1234, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
+; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
+; GCN: buffer_store_dwordx2 [[LOAD]]
+define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+
+  %lo = load i32, i32 addrspace(1)* %in
+  %hi = load i32, i32 addrspace(1)* %in.gep.1
+
+  store i32 %lo, i32 addrspace(1)* %out
+  store i32 %hi, i32 addrspace(1)* %out.gep.1
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
+; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
+  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
+
+  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
+  %lo = load i32, i32 addrspace(1)* %in.gep.0
+  %hi = load i32, i32 addrspace(1)* %in.gep.1
+
+  store i32 %lo, i32 addrspace(1)* %out.gep.0
+  store i32 %hi, i32 addrspace(1)* %out.gep.1
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
+; GCN: buffer_load_dword v
+; GCN: buffer_load_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+
+  %lo = load i32, i32 addrspace(1)* %in
+  %hi = load i32, i32 addrspace(1)* %in.gep.1
+
+  store i32 %hi, i32 addrspace(1)* %out
+  store i32 %lo, i32 addrspace(1)* %out.gep.1
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
+; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
+; GCN: buffer_store_dwordx4 [[LOAD]]
+define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
+  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
+  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
+
+  %x = load i32, i32 addrspace(1)* %in
+  %y = load i32, i32 addrspace(1)* %in.gep.1
+  %z = load i32, i32 addrspace(1)* %in.gep.2
+  %w = load i32, i32 addrspace(1)* %in.gep.3
+
+  store i32 %x, i32 addrspace(1)* %out
+  store i32 %y, i32 addrspace(1)* %out.gep.1
+  store i32 %z, i32 addrspace(1)* %out.gep.2
+  store i32 %w, i32 addrspace(1)* %out.gep.3
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
+; SI-DAG: buffer_load_dwordx2
+; SI-DAG: buffer_load_dword v
+; GCN: s_waitcnt
+; SI-DAG: buffer_store_dword v
+; SI-DAG: buffer_store_dwordx2 v
+; GCN: s_endpgm
+define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
+
+  %x = load i32, i32 addrspace(1)* %in
+  %y = load i32, i32 addrspace(1)* %in.gep.1
+  %z = load i32, i32 addrspace(1)* %in.gep.2
+
+  store i32 %x, i32 addrspace(1)* %out
+  store i32 %y, i32 addrspace(1)* %out.gep.1
+  store i32 %z, i32 addrspace(1)* %out.gep.2
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
+; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
+; GCN: buffer_store_dwordx4 [[LOAD]]
+define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
+  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
+  %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
+  %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
+  %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
+
+  %x = load float, float addrspace(1)* %in
+  %y = load float, float addrspace(1)* %in.gep.1
+  %z = load float, float addrspace(1)* %in.gep.2
+  %w = load float, float addrspace(1)* %in.gep.3
+
+  store float %x, float addrspace(1)* %out
+  store float %y, float addrspace(1)* %out.gep.1
+  store float %z, float addrspace(1)* %out.gep.2
+  store float %w, float addrspace(1)* %out.gep.3
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
+; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
+; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
+define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
+  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
+  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
+  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
+  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
+  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
+  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
+
+  %x = load i32, i32 addrspace(1)* %in.gep.0
+  %y = load i32, i32 addrspace(1)* %in.gep.1
+  %z = load i32, i32 addrspace(1)* %in.gep.2
+  %w = load i32, i32 addrspace(1)* %in.gep.3
+
+  store i32 %x, i32 addrspace(1)* %out.gep.0
+  store i32 %y, i32 addrspace(1)* %out.gep.1
+  store i32 %z, i32 addrspace(1)* %out.gep.2
+  store i32 %w, i32 addrspace(1)* %out.gep.3
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
+; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
+; GCN: s_barrier
+; GCN: buffer_store_dwordx4 [[LOAD]]
+define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
+  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
+  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
+
+  %x = load i32, i32 addrspace(1)* %in
+  %y = load i32, i32 addrspace(1)* %in.gep.1
+  %z = load i32, i32 addrspace(1)* %in.gep.2
+  %w = load i32, i32 addrspace(1)* %in.gep.3
+
+  ; Make sure the barrier doesn't stop this
+  tail call void @llvm.AMDGPU.barrier.local() #1
+
+  store i32 %w, i32 addrspace(1)* %out.gep.3
+  store i32 %z, i32 addrspace(1)* %out.gep.2
+  store i32 %y, i32 addrspace(1)* %out.gep.1
+  store i32 %x, i32 addrspace(1)* %out
+
+  ret void
+}
+
+; TODO: Re-packing of loaded register required. Maybe an IR pass
+; should catch this?
+
+; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
+; GCN: buffer_load_dword v
+; GCN: buffer_load_dword v
+; GCN: buffer_load_dword v
+; GCN: buffer_load_dword v
+; GCN: s_barrier
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
+  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
+  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
+
+  %x = load i32, i32 addrspace(1)* %in
+  %y = load i32, i32 addrspace(1)* %in.gep.1
+  %z = load i32, i32 addrspace(1)* %in.gep.2
+  %w = load i32, i32 addrspace(1)* %in.gep.3
+
+  ; Make sure the barrier doesn't stop this
+  tail call void @llvm.AMDGPU.barrier.local() #1
+
+  store i32 %w, i32 addrspace(1)* %out
+  store i32 %z, i32 addrspace(1)* %out.gep.1
+  store i32 %y, i32 addrspace(1)* %out.gep.2
+  store i32 %x, i32 addrspace(1)* %out.gep.3
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
+; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
+; GCN: buffer_store_dword [[LOAD]]
+; GCN: s_endpgm
+define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
+  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
+  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
+  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
+  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
+  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
+
+  %x = load i8, i8 addrspace(1)* %in, align 4
+  %y = load i8, i8 addrspace(1)* %in.gep.1
+  %z = load i8, i8 addrspace(1)* %in.gep.2
+  %w = load i8, i8 addrspace(1)* %in.gep.3
+
+  store i8 %x, i8 addrspace(1)* %out, align 4
+  store i8 %y, i8 addrspace(1)* %out.gep.1
+  store i8 %z, i8 addrspace(1)* %out.gep.2
+  store i8 %w, i8 addrspace(1)* %out.gep.3
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: s_endpgm
+define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
+  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
+  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
+  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
+  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
+  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
+
+  %x = load i8, i8 addrspace(1)* %in
+  %y = load i8, i8 addrspace(1)* %in.gep.1
+  %z = load i8, i8 addrspace(1)* %in.gep.2
+  %w = load i8, i8 addrspace(1)* %in.gep.3
+
+  store i8 %x, i8 addrspace(1)* %out
+  store i8 %y, i8 addrspace(1)* %out.gep.1
+  store i8 %z, i8 addrspace(1)* %out.gep.2
+  store i8 %w, i8 addrspace(1)* %out.gep.3
+  ret void
+}
+
+; This works once AA is enabled on the subtarget
+; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
+; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
+; XGCN: buffer_store_dwordx4 [[LOAD]]
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
+  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in
+
+  %x = extractelement <4 x i32> %vec, i32 0
+  %y = extractelement <4 x i32> %vec, i32 1
+  %z = extractelement <4 x i32> %vec, i32 2
+  %w = extractelement <4 x i32> %vec, i32 3
+
+  store i32 %x, i32 addrspace(1)* %out
+  store i32 %y, i32 addrspace(1)* %out.gep.1
+  store i32 %z, i32 addrspace(1)* %out.gep.2
+  store i32 %w, i32 addrspace(1)* %out.gep.3
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: s_endpgm
+define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
+  %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
+
+  store i8 123, i8 addrspace(3)* %out.gep.1
+  store i8 456, i8 addrspace(3)* %out, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
+; GCN-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8
+; GCN-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]]
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]]
+; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
+define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
+
+  store i32 123, i32 addrspace(3)* %out.gep.1
+  store i32 456, i32 addrspace(3)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
+; GCN: ds_write_b32
+; GCN: ds_write_b32
+; GCN: ds_write_b32
+; GCN: ds_write_b32
+define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
+
+  store i32 123, i32 addrspace(3)* %out.gep.1
+  store i32 456, i32 addrspace(3)* %out.gep.2
+  store i32 333, i32 addrspace(3)* %out.gep.3
+  store i32 1234, i32 addrspace(3)* %out
+  ret void
+}
+
+declare void @llvm.AMDGPU.barrier.local() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { noduplicate nounwind }
diff --git a/test/CodeGen/R600/min.ll b/test/CodeGen/R600/min.ll
index 00ba5c6cddb4..275e9a7d899b 100644
--- a/test/CodeGen/R600/min.ll
+++ b/test/CodeGen/R600/min.ll
@@ -6,11 +6,11 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 ; SI: v_min_i32_e32
 define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
-  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %a = load i32 addrspace(1)* %gep0, align 4
-  %b = load i32 addrspace(1)* %gep1, align 4
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep0, align 4
+  %b = load i32, i32 addrspace(1)* %gep1, align 4
   %cmp = icmp sle i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %outgep, align 4
@@ -30,11 +30,11 @@ define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
 ; SI: v_min_i32_e32
 define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
-  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %a = load i32 addrspace(1)* %gep0, align 4
-  %b = load i32 addrspace(1)* %gep1, align 4
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep0, align 4
+  %b = load i32, i32 addrspace(1)* %gep1, align 4
   %cmp = icmp slt i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %outgep, align 4
@@ -50,15 +50,33 @@ define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_test_imin_slt_imm_i32:
+; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
+define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+  %cmp = icmp slt i32 %a, 8
+  %val = select i1 %cmp, i32 %a, i32 8
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_imin_sle_imm_i32:
+; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
+define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+  %cmp = icmp sle i32 %a, 8
+  %val = select i1 %cmp, i32 %a, i32 8
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
 ; FUNC-LABEL: @v_test_umin_ule_i32
 ; SI: v_min_u32_e32
 define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
-  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %a = load i32 addrspace(1)* %gep0, align 4
-  %b = load i32 addrspace(1)* %gep1, align 4
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep0, align 4
+  %b = load i32, i32 addrspace(1)* %gep1, align 4
   %cmp = icmp ule i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %outgep, align 4
@@ -78,11 +96,11 @@ define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
 ; SI: v_min_u32_e32
 define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
-  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %a = load i32 addrspace(1)* %gep0, align 4
-  %b = load i32 addrspace(1)* %gep1, align 4
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep0, align 4
+  %b = load i32, i32 addrspace(1)* %gep1, align 4
   %cmp = icmp ult i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %outgep, align 4
@@ -106,12 +124,12 @@ define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
 ; SI: s_endpgm
 define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
-  %outgep0 = getelementptr i32 addrspace(1)* %out0, i32 %tid
-  %outgep1 = getelementptr i1 addrspace(1)* %out1, i32 %tid
-  %a = load i32 addrspace(1)* %gep0, align 4
-  %b = load i32 addrspace(1)* %gep1, align 4
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+  %outgep0 = getelementptr i32, i32 addrspace(1)* %out0, i32 %tid
+  %outgep1 = getelementptr i1, i1 addrspace(1)* %out1, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep0, align 4
+  %b = load i32, i32 addrspace(1)* %gep1, align 4
   %cmp = icmp ult i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %outgep0, align 4
diff --git a/test/CodeGen/R600/min3.ll b/test/CodeGen/R600/min3.ll
index 6c11a650fcbb..38ef46d1bdd6 100644
--- a/test/CodeGen/R600/min3.ll
+++ b/test/CodeGen/R600/min3.ll
@@ -6,13 +6,13 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 ; SI: v_min3_i32
 define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %a = load i32 addrspace(1)* %gep0, align 4
-  %b = load i32 addrspace(1)* %gep1, align 4
-  %c = load i32 addrspace(1)* %gep2, align 4
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep0, align 4
+  %b = load i32, i32 addrspace(1)* %gep1, align 4
+  %c = load i32, i32 addrspace(1)* %gep2, align 4
   %icmp0 = icmp slt i32 %a, %b
   %i0 = select i1 %icmp0, i32 %a, i32 %b
   %icmp1 = icmp slt i32 %i0, %c
@@ -25,13 +25,13 @@ define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %apt
 ; SI: v_min3_u32
 define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %a = load i32 addrspace(1)* %gep0, align 4
-  %b = load i32 addrspace(1)* %gep1, align 4
-  %c = load i32 addrspace(1)* %gep2, align 4
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep0, align 4
+  %b = load i32, i32 addrspace(1)* %gep1, align 4
+  %c = load i32, i32 addrspace(1)* %gep2, align 4
   %icmp0 = icmp ult i32 %a, %b
   %i0 = select i1 %icmp0, i32 %a, i32 %b
   %icmp1 = icmp ult i32 %i0, %c
@@ -46,21 +46,21 @@ define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %apt
 define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
   %tid2 = mul i32 %tid, 2
-  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
 
-  %gep3 = getelementptr i32 addrspace(1)* %aptr, i32 %tid2
-  %gep4 = getelementptr i32 addrspace(1)* %bptr, i32 %tid2
-  %gep5 = getelementptr i32 addrspace(1)* %cptr, i32 %tid2
+  %gep3 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid2
+  %gep4 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid2
+  %gep5 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid2
 
-  %outgep0 = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %outgep1 = getelementptr i32 addrspace(1)* %out, i32 %tid2
+  %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2
 
-  %a = load i32 addrspace(1)* %gep0, align 4
-  %b = load i32 addrspace(1)* %gep1, align 4
-  %c = load i32 addrspace(1)* %gep2, align 4
-  %d = load i32 addrspace(1)* %gep3, align 4
+  %a = load i32, i32 addrspace(1)* %gep0, align 4
+  %b = load i32, i32 addrspace(1)* %gep1, align 4
+  %c = load i32, i32 addrspace(1)* %gep2, align 4
+  %d = load i32, i32 addrspace(1)* %gep3, align 4
 
   %icmp0 = icmp slt i32 %a, %b
   %i0 = select i1 %icmp0, i32 %a, i32 %b
@@ -80,21 +80,21 @@ define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %ap
 define void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
   %tid2 = mul i32 %tid, 2
-  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
+  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
 
-  %gep3 = getelementptr i32 addrspace(1)* %aptr, i32 %tid2
-  %gep4 = getelementptr i32 addrspace(1)* %bptr, i32 %tid2
-  %gep5 = getelementptr i32 addrspace(1)* %cptr, i32 %tid2
+  %gep3 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid2
+  %gep4 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid2
+  %gep5 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid2
 
-  %outgep0 = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %outgep1 = getelementptr i32 addrspace(1)* %out, i32 %tid2
+  %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2
 
-  %a = load i32 addrspace(1)* %gep0, align 4
-  %b = load i32 addrspace(1)* %gep1, align 4
-  %c = load i32 addrspace(1)* %gep2, align 4
-  %d = load i32 addrspace(1)* %gep3, align 4
+  %a = load i32, i32 addrspace(1)* %gep0, align 4
+  %b = load i32, i32 addrspace(1)* %gep1, align 4
+  %c = load i32, i32 addrspace(1)* %gep2, align 4
+  %d = load i32, i32 addrspace(1)* %gep3, align 4
 
   %icmp0 = icmp slt i32 %a, %b
   %i0 = select i1 %icmp0, i32 %a, i32 %b
diff --git a/test/CodeGen/R600/misaligned-load.ll b/test/CodeGen/R600/misaligned-load.ll
deleted file mode 100644
index 6290ca09d502..000000000000
--- a/test/CodeGen/R600/misaligned-load.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-; SI: @byte_aligned_load64
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: s_endpgm
-define void @byte_aligned_load64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) {
-entry:
-  %0 = load i64 addrspace(3)* %in, align 1
-  store i64 %0, i64 addrspace(1)* %out
-  ret void
-}
diff --git a/test/CodeGen/R600/missing-store.ll b/test/CodeGen/R600/missing-store.ll
index 8ddef35a694a..4af9cdf1b960 100644
--- a/test/CodeGen/R600/missing-store.ll
+++ b/test/CodeGen/R600/missing-store.ll
@@ -12,11 +12,11 @@
 ; SI: buffer_store_dword
 ; SI: s_endpgm
 define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
-  %ptr0 = load i32 addrspace(2)* addrspace(3)* @ptr_load, align 8
-  %ptr2 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 2
+  %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @ptr_load, align 8
+  %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
 
   store i32 99, i32 addrspace(1)* %gptr, align 4
-  %tmp2 = load i32 addrspace(2)* %ptr2, align 4
+  %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
 
   store i32 %tmp2, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/R600/mubuf.ll b/test/CodeGen/R600/mubuf.ll
index 9c2a17ce04f4..b19163f294e0 100644
--- a/test/CodeGen/R600/mubuf.ll
+++ b/test/CodeGen/R600/mubuf.ll
@@ -11,8 +11,8 @@ declare i32 @llvm.r600.read.tidig.x() readnone
 ; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0
 define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
-  %0 = getelementptr i32 addrspace(1)* %in, i64 1
-  %1 = load i32 addrspace(1)* %0
+  %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1
+  %1 = load i32, i32 addrspace(1)* %0
   store i32 %1, i32 addrspace(1)* %out
   ret void
 }
@@ -22,19 +22,20 @@ entry:
 ; CHECK: buffer_load_ubyte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0
 define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
-  %0 = getelementptr i8 addrspace(1)* %in, i64 4095
-  %1 = load i8 addrspace(1)* %0
+  %0 = getelementptr i8, i8 addrspace(1)* %in, i64 4095
+  %1 = load i8, i8 addrspace(1)* %0
   store i8 %1, i8 addrspace(1)* %out
   ret void
 }
 
 ; MUBUF load with an immediate byte offset that doesn't fit into 12-bits
 ; CHECK-LABEL: {{^}}mubuf_load2:
-; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 ; encoding: [0x00,0x80,0x30,0xe0
+; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000
+; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x30,0xe0
 define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
-  %0 = getelementptr i32 addrspace(1)* %in, i64 1024
-  %1 = load i32 addrspace(1)* %0
+  %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1024
+  %1 = load i32, i32 addrspace(1)* %0
   store i32 %1, i32 addrspace(1)* %out
   ret void
 }
@@ -45,9 +46,9 @@ entry:
 ; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x30,0xe0
 define void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) {
 entry:
-  %0 = getelementptr i32 addrspace(1)* %in, i64 %offset
-  %1 = getelementptr i32 addrspace(1)* %0, i64 1
-  %2 = load i32 addrspace(1)* %1
+  %0 = getelementptr i32, i32 addrspace(1)* %in, i64 %offset
+  %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1
+  %2 = load i32, i32 addrspace(1)* %1
   store i32 %2, i32 addrspace(1)* %out
   ret void
 }
@@ -56,8 +57,8 @@ entry:
 ; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 64 offen glc
 define void @soffset_max_imm([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 {
 main_body:
-  %tmp0 = getelementptr [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0
-  %tmp1 = load <16 x i8> addrspace(2)* %tmp0
+  %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0
+  %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0
   %tmp2 = shl i32 %6, 2
   %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
   %tmp4 = add i32 %6, 16
@@ -75,8 +76,8 @@ main_body:
 ; CHECK: buffer_load_dword v{{[0-9+]}}, v{{[0-9+]}}, s[{{[0-9]+}}:{{[0-9]+}}], [[SOFFSET]] offen glc
 define void @soffset_no_fold([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 {
 main_body:
-  %tmp0 = getelementptr [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0
-  %tmp1 = load <16 x i8> addrspace(2)* %tmp0
+  %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0
+  %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0
   %tmp2 = shl i32 %6, 2
   %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
   %tmp4 = add i32 %6, 16
@@ -85,12 +86,6 @@ main_body:
   ret void
 }
 
-declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #3
-declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
-
-attributes #1 = { "ShaderType"="2" "unsafe-fp-math"="true" }
-attributes #3 = { nounwind readonly }
-
 ;;;==========================================================================;;;
 ;;; MUBUF STORE TESTS
 ;;;==========================================================================;;;
@@ -100,7 +95,7 @@ attributes #3 = { nounwind readonly }
 ; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x70,0xe0
 define void @mubuf_store0(i32 addrspace(1)* %out) {
 entry:
-  %0 = getelementptr i32 addrspace(1)* %out, i64 1
+  %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1
   store i32 0, i32 addrspace(1)* %0
   ret void
 }
@@ -111,17 +106,18 @@ entry:
 
 define void @mubuf_store1(i8 addrspace(1)* %out) {
 entry:
-  %0 = getelementptr i8 addrspace(1)* %out, i64 4095
+  %0 = getelementptr i8, i8 addrspace(1)* %out, i64 4095
   store i8 0, i8 addrspace(1)* %0
   ret void
 }
 
 ; MUBUF store with an immediate byte offset that doesn't fit into 12-bits
 ; CHECK-LABEL: {{^}}mubuf_store2:
-; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0 addr64 ; encoding: [0x00,0x80,0x70,0xe0
+; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000
+; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x70,0xe0
 define void @mubuf_store2(i32 addrspace(1)* %out) {
 entry:
-  %0 = getelementptr i32 addrspace(1)* %out, i64 1024
+  %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1024
   store i32 0, i32 addrspace(1)* %0
   ret void
 }
@@ -132,8 +128,8 @@ entry:
 ; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x70,0xe0
 define void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) {
 entry:
-  %0 = getelementptr i32 addrspace(1)* %out, i64 %offset
-  %1 = getelementptr i32 addrspace(1)* %0, i64 1
+  %0 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset
+  %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1
   store i32 0, i32 addrspace(1)* %1
   ret void
 }
@@ -148,24 +144,40 @@ define void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 {
 ; CHECK-LABEL: {{^}}store_sgpr_ptr_offset:
 ; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40
 define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
-  %out.gep = getelementptr i32 addrspace(1)* %out, i32 10
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 10
   store i32 99, i32 addrspace(1)* %out.gep, align 4
   ret void
 }
 
 ; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset:
-; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
+; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
 define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 {
-  %out.gep = getelementptr i32 addrspace(1)* %out, i32 32768
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768
   store i32 99, i32 addrspace(1)* %out.gep, align 4
   ret void
 }
 
+; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic:
+; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
+; CHECK: buffer_atomic_add v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
+define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 {
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768
+  %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 5 seq_cst
+  ret void
+}
+
 ; CHECK-LABEL: {{^}}store_vgpr_ptr:
 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
 define void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() readnone
-  %out.gep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
   store i32 99, i32 addrspace(1)* %out.gep, align 4
   ret void
 }
+
+declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #3
+declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+
+attributes #1 = { "ShaderType"="2" "unsafe-fp-math"="true" }
+attributes #3 = { nounwind readonly }
diff --git a/test/CodeGen/R600/mul.ll b/test/CodeGen/R600/mul.ll
index 6f15e706dff8..94e0f96b323e 100644
--- a/test/CodeGen/R600/mul.ll
+++ b/test/CodeGen/R600/mul.ll
@@ -12,9 +12,9 @@
 ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
-  %a = load <2 x i32> addrspace(1) * %in
-  %b = load <2 x i32> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
+  %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
   %result = mul <2 x i32> %a, %b
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
@@ -32,9 +32,9 @@ define void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)
 ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
-  %a = load <4 x i32> addrspace(1) * %in
-  %b = load <4 x i32> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
+  %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
   %result = mul <4 x i32> %a, %b
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
@@ -58,8 +58,8 @@ define void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 ; SI: v_mul_lo_i32
 ; SI: buffer_store_dword
 define void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
-  %a = load i64 addrspace(1)* %aptr, align 8
-  %b = load i64 addrspace(1)* %bptr, align 8
+  %a = load i64, i64 addrspace(1)* %aptr, align 8
+  %b = load i64, i64 addrspace(1)* %bptr, align 8
   %mul = mul i64 %b, %a
   %trunc = trunc i64 %mul to i32
   store i32 %trunc, i32 addrspace(1)* %out, align 8
@@ -88,7 +88,7 @@ entry:
 ; SI-DAG: v_mul_hi_i32
 ; SI: s_endpgm
 define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %val = load i32 addrspace(1)* %in, align 4
+  %val = load i32, i32 addrspace(1)* %in, align 4
   %ext = sext i32 %val to i64
   %mul = mul i64 %ext, 80
   store i64 %mul, i64 addrspace(1)* %out, align 8
@@ -100,7 +100,7 @@ define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, 9, v{{[0-9]+}}
 ; SI: s_endpgm
 define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %val = load i32 addrspace(1)* %in, align 4
+  %val = load i32, i32 addrspace(1)* %in, align 4
   %ext = sext i32 %val to i64
   %mul = mul i64 %ext, 9
   store i64 %mul, i64 addrspace(1)* %out, align 8
@@ -123,9 +123,9 @@ define void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
 ; FUNC-LABEL: {{^}}v_mul_i32:
 ; SI: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 define void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %a = load i32 addrspace(1)* %in
-  %b = load i32 addrspace(1)* %b_ptr
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %a = load i32, i32 addrspace(1)* %in
+  %b = load i32, i32 addrspace(1)* %b_ptr
   %result = mul i32 %a, %b
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -148,8 +148,8 @@ define void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 ; FUNC-LABEL: {{^}}v_mul_i64:
 ; SI: v_mul_lo_i32
 define void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
-  %a = load i64 addrspace(1)* %aptr, align 8
-  %b = load i64 addrspace(1)* %bptr, align 8
+  %a = load i64, i64 addrspace(1)* %aptr, align 8
+  %b = load i64, i64 addrspace(1)* %bptr, align 8
   %mul = mul i64 %a, %b
   store i64 %mul, i64 addrspace(1)* %out, align 8
   ret void
@@ -163,7 +163,7 @@ entry:
   br i1 %0, label %if, label %else
 
 if:
-  %1 = load i32 addrspace(1)* %in
+  %1 = load i32, i32 addrspace(1)* %in
   br label %endif
 
 else:
@@ -186,7 +186,7 @@ entry:
   br i1 %0, label %if, label %else
 
 if:
-  %1 = load i64 addrspace(1)* %in
+  %1 = load i64, i64 addrspace(1)* %in
   br label %endif
 
 else:
diff --git a/test/CodeGen/R600/no-initializer-constant-addrspace.ll b/test/CodeGen/R600/no-initializer-constant-addrspace.ll
index 532edf07c301..9a814b579deb 100644
--- a/test/CodeGen/R600/no-initializer-constant-addrspace.ll
+++ b/test/CodeGen/R600/no-initializer-constant-addrspace.ll
@@ -6,7 +6,7 @@
 
 ; FUNC-LABEL: {{^}}load_extern_const_init:
 define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind {
-  %val = load i32 addrspace(2)* getelementptr ([5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4
+  %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4
   store i32 %val, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -15,7 +15,7 @@ define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind {
 
 ; FUNC-LABEL: {{^}}load_undef_const_init:
 define void @load_undef_const_init(i32 addrspace(1)* %out) nounwind {
-  %val = load i32 addrspace(2)* getelementptr ([5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4
+  %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4
   store i32 %val, i32 addrspace(1)* %out, align 4
   ret void
 }
diff --git a/test/CodeGen/R600/no-shrink-extloads.ll b/test/CodeGen/R600/no-shrink-extloads.ll
index 135d22d3036d..e4328ecbaca8 100644
--- a/test/CodeGen/R600/no-shrink-extloads.ll
+++ b/test/CodeGen/R600/no-shrink-extloads.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
@@ -23,9 +23,9 @@ define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounw
 ; SI: buffer_store_short v
 define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i16 addrspace(1)* %out, i32 %tid
-  %load = load i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+  %load = load i32, i32 addrspace(1)* %gep.in
   %trunc = trunc i32 %load to i16
   store i16 %trunc, i16 addrspace(1)* %gep.out
   ret void
@@ -45,9 +45,9 @@ define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwin
 ; SI: buffer_store_byte v
 define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i8 addrspace(1)* %out, i32 %tid
-  %load = load i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
+  %load = load i32, i32 addrspace(1)* %gep.in
   %trunc = trunc i32 %load to i8
   store i8 %trunc, i8 addrspace(1)* %gep.out
   ret void
@@ -67,9 +67,9 @@ define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwin
 ; SI: buffer_store_byte v
 define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i1 addrspace(1)* %out, i32 %tid
-  %load = load i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i1, i1 addrspace(1)* %out, i32 %tid
+  %load = load i32, i32 addrspace(1)* %gep.in
   %trunc = trunc i32 %load to i1
   store i1 %trunc, i1 addrspace(1)* %gep.out
   ret void
@@ -89,9 +89,9 @@ define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounw
 ; SI: buffer_store_dword v
 define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.in = getelementptr i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %load = load i64 addrspace(1)* %gep.in
+  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %load = load i64, i64 addrspace(1)* %gep.in
   %trunc = trunc i64 %load to i32
   store i32 %trunc, i32 addrspace(1)* %gep.out
   ret void
@@ -112,9 +112,9 @@ define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
 ; SI: buffer_store_dword v
 define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.in = getelementptr i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %load = load i64 addrspace(1)* %gep.in
+  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %load = load i64, i64 addrspace(1)* %gep.in
   %srl = lshr i64 %load, 32
   %trunc = trunc i64 %srl to i32
   store i32 %trunc, i32 addrspace(1)* %gep.out
@@ -136,9 +136,9 @@ define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwin
 ; SI: buffer_store_byte v
 define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.in = getelementptr i16 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i8 addrspace(1)* %out, i32 %tid
-  %load = load i16 addrspace(1)* %gep.in
+  %gep.in = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
+  %load = load i16, i16 addrspace(1)* %gep.in
   %trunc = trunc i16 %load to i8
   store i8 %trunc, i8 addrspace(1)* %gep.out
   ret void
@@ -159,9 +159,9 @@ define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
 ; SI: buffer_store_byte v
 define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.in = getelementptr i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i8 addrspace(1)* %out, i32 %tid
-  %load = load i64 addrspace(1)* %gep.in
+  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
+  %load = load i64, i64 addrspace(1)* %gep.in
   %srl = lshr i64 %load, 32
   %trunc = trunc i64 %srl to i8
   store i8 %trunc, i8 addrspace(1)* %gep.out
@@ -182,9 +182,9 @@ define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwin
 ; SI: buffer_store_byte v
 define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.in = getelementptr i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i8 addrspace(1)* %out, i32 %tid
-  %load = load i64 addrspace(1)* %gep.in
+  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
+  %load = load i64, i64 addrspace(1)* %gep.in
   %trunc = trunc i64 %load to i8
   store i8 %trunc, i8 addrspace(1)* %gep.out
   ret void
diff --git a/test/CodeGen/R600/operand-folding.ll b/test/CodeGen/R600/operand-folding.ll
index 88a8145dcd62..816755efb07c 100644
--- a/test/CodeGen/R600/operand-folding.ll
+++ b/test/CodeGen/R600/operand-folding.ll
@@ -10,7 +10,7 @@ entry:
 if:
   %id = call i32 @llvm.r600.read.tidig.x()
   %offset = add i32 %fold, %id
-  %tmp1 = getelementptr i32 addrspace(1)* %out, i32 %offset
+  %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset
   store i32 0, i32 addrspace(1)* %tmp1
   br label %endif
 
@@ -19,7 +19,7 @@ endif:
 }
 
 ; CHECK-LABEL: {{^}}fold_imm:
-; CHECK v_or_i32_e32 v{{[0-9]+}}, 5
+; CHECK: v_or_b32_e32 v{{[0-9]+}}, 5
 define void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) {
 entry:
   %fold = add i32 3, 2
diff --git a/test/CodeGen/R600/operand-spacing.ll b/test/CodeGen/R600/operand-spacing.ll
index dd9f25aad7f2..20420a84de6f 100644
--- a/test/CodeGen/R600/operand-spacing.ll
+++ b/test/CodeGen/R600/operand-spacing.ll
@@ -1,13 +1,16 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s
 
 ; Make sure there isn't an extra space between the instruction name and first operands.
 
-; SI-LABEL: {{^}}add_f32:
+; GCN-LABEL: {{^}}add_f32:
 ; SI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]]
-; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]]
-; SI: buffer_store_dword [[RESULT]],
+; VI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]]
+; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]]
+; GCN: buffer_store_dword [[RESULT]],
 define void @add_f32(float addrspace(1)* %out, float %a, float %b) {
   %result = fadd float %a, %b
   store float %result, float addrspace(1)* %out
diff --git a/test/CodeGen/R600/or.ll b/test/CodeGen/R600/or.ll
index 0d9a6992a6bd..1c04090b407f 100644
--- a/test/CodeGen/R600/or.ll
+++ b/test/CodeGen/R600/or.ll
@@ -2,45 +2,42 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
-; EG-LABEL: {{^}}or_v2i32:
+
+; FUNC-LABEL: {{^}}or_v2i32:
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-; SI-LABEL: {{^}}or_v2i32:
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-
 define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
-  %a = load <2 x i32> addrspace(1) * %in
-  %b = load <2 x i32> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
+  %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
   %result = or <2 x i32> %a, %b
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
 }
 
-; EG-LABEL: {{^}}or_v4i32:
+; FUNC-LABEL: {{^}}or_v4i32:
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-; SI-LABEL: {{^}}or_v4i32:
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-
 define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
-  %a = load <4 x i32> addrspace(1) * %in
-  %b = load <4 x i32> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
+  %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
   %result = or <4 x i32> %a, %b
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
 }
 
-; SI-LABEL: {{^}}scalar_or_i32:
+; FUNC-LABEL: {{^}}scalar_or_i32:
 ; SI: s_or_b32
 define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %or = or i32 %a, %b
@@ -48,16 +45,16 @@ define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   ret void
 }
 
-; SI-LABEL: {{^}}vector_or_i32:
+; FUNC-LABEL: {{^}}vector_or_i32:
 ; SI: v_or_b32_e32 v{{[0-9]}}
 define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) {
-  %loada = load i32 addrspace(1)* %a
+  %loada = load i32, i32 addrspace(1)* %a
   %or = or i32 %loada, %b
   store i32 %or, i32 addrspace(1)* %out
   ret void
 }
 
-; SI-LABEL: {{^}}scalar_or_literal_i32:
+; FUNC-LABEL: {{^}}scalar_or_literal_i32:
 ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1869f
 define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) {
   %or = or i32 %a, 99999
@@ -65,28 +62,28 @@ define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) {
   ret void
 }
 
-; SI-LABEL: {{^}}vector_or_literal_i32:
+; FUNC-LABEL: {{^}}vector_or_literal_i32:
 ; SI: v_or_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
 define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
-  %loada = load i32 addrspace(1)* %a, align 4
+  %loada = load i32, i32 addrspace(1)* %a, align 4
   %or = or i32 %loada, 65535
   store i32 %or, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: {{^}}vector_or_inline_immediate_i32:
+; FUNC-LABEL: {{^}}vector_or_inline_immediate_i32:
 ; SI: v_or_b32_e32 v{{[0-9]+}}, 4, v{{[0-9]+}}
 define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
-  %loada = load i32 addrspace(1)* %a, align 4
+  %loada = load i32, i32 addrspace(1)* %a, align 4
   %or = or i32 %loada, 4
   store i32 %or, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-LABEL: {{^}}scalar_or_i64:
+; FUNC-LABEL: {{^}}scalar_or_i64:
 ; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
 ; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
-; SI-LABEL: {{^}}scalar_or_i64:
+
 ; SI: s_or_b64
 define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %or = or i64 %a, %b
@@ -94,28 +91,28 @@ define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   ret void
 }
 
-; SI-LABEL: {{^}}vector_or_i64:
+; FUNC-LABEL: {{^}}vector_or_i64:
 ; SI: v_or_b32_e32 v{{[0-9]}}
 ; SI: v_or_b32_e32 v{{[0-9]}}
 define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64 addrspace(1)* %a, align 8
-  %loadb = load i64 addrspace(1)* %a, align 8
+  %loada = load i64, i64 addrspace(1)* %a, align 8
+  %loadb = load i64, i64 addrspace(1)* %a, align 8
   %or = or i64 %loada, %loadb
   store i64 %or, i64 addrspace(1)* %out
   ret void
 }
 
-; SI-LABEL: {{^}}scalar_vector_or_i64:
+; FUNC-LABEL: {{^}}scalar_vector_or_i64:
 ; SI: v_or_b32_e32 v{{[0-9]}}
 ; SI: v_or_b32_e32 v{{[0-9]}}
 define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) {
-  %loada = load i64 addrspace(1)* %a
+  %loada = load i64, i64 addrspace(1)* %a
   %or = or i64 %loada, %b
   store i64 %or, i64 addrspace(1)* %out
   ret void
 }
 
-; SI-LABEL: {{^}}vector_or_i64_loadimm:
+; FUNC-LABEL: {{^}}vector_or_i64_loadimm:
 ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xdf77987f
 ; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x146f
 ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
@@ -123,26 +120,26 @@ define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a,
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
 define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64 addrspace(1)* %a, align 8
+  %loada = load i64, i64 addrspace(1)* %a, align 8
   %or = or i64 %loada, 22470723082367
   store i64 %or, i64 addrspace(1)* %out
   ret void
 }
 
 ; FIXME: The or 0 should really be removed.
-; SI-LABEL: {{^}}vector_or_i64_imm:
+; FUNC-LABEL: {{^}}vector_or_i64_imm:
 ; SI: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
 ; SI: v_or_b32_e32 {{v[0-9]+}}, 8, v[[LO_VREG]]
 ; SI: v_or_b32_e32 {{v[0-9]+}}, 0, {{.*}}
 ; SI: s_endpgm
 define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64 addrspace(1)* %a, align 8
+  %loada = load i64, i64 addrspace(1)* %a, align 8
   %or = or i64 %loada, 8
   store i64 %or, i64 addrspace(1)* %out
   ret void
 }
 
-; SI-LABEL: {{^}}trunc_i64_or_to_i32:
+; FUNC-LABEL: {{^}}trunc_i64_or_to_i32:
 ; SI: s_load_dword s[[SREG0:[0-9]+]]
 ; SI: s_load_dword s[[SREG1:[0-9]+]]
 ; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]]
@@ -155,18 +152,27 @@ define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
   ret void
 }
 
-; EG-CHECK: {{^}}or_i1:
-; EG-CHECK: OR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
+; FUNC-LABEL: {{^}}or_i1:
+; EG: OR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
 
-; SI-CHECK: {{^}}or_i1:
-; SI-CHECK: s_or_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
-define void @or_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
-  %a = load float addrspace(1) * %in0
-  %b = load float addrspace(1) * %in1
+; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}]
+define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
+  %a = load float, float addrspace(1)* %in0
+  %b = load float, float addrspace(1)* %in1
   %acmp = fcmp oge float %a, 0.000000e+00
   %bcmp = fcmp oge float %b, 0.000000e+00
   %or = or i1 %acmp, %bcmp
-  %result = select i1 %or, float %a, float %b
-  store float %result, float addrspace(1)* %out
+  %result = zext i1 %or to i32
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_or_i1:
+; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}]
+define void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+  %cmp0 = icmp eq i32 %a, %b
+  %cmp1 = icmp eq i32 %c, %d
+  %or = or i1 %cmp0, %cmp1
+  store i1 %or, i1 addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/R600/parallelandifcollapse.ll b/test/CodeGen/R600/parallelandifcollapse.ll
index 82b11501e865..f32b044198ab 100644
--- a/test/CodeGen/R600/parallelandifcollapse.ll
+++ b/test/CodeGen/R600/parallelandifcollapse.ll
@@ -23,14 +23,14 @@ entry:
   %c1 = alloca i32, align 4
   %d1 = alloca i32, align 4
   %data = alloca i32, align 4
-  %0 = load i32* %a0, align 4
-  %1 = load i32* %b0, align 4
+  %0 = load i32, i32* %a0, align 4
+  %1 = load i32, i32* %b0, align 4
   %cmp = icmp ne i32 %0, %1
   br i1 %cmp, label %land.lhs.true, label %if.end
 
 land.lhs.true:                                    ; preds = %entry
-  %2 = load i32* %c0, align 4
-  %3 = load i32* %d0, align 4
+  %2 = load i32, i32* %c0, align 4
+  %3 = load i32, i32* %d0, align 4
   %cmp1 = icmp ne i32 %2, %3
   br i1 %cmp1, label %if.then, label %if.end
 
@@ -39,14 +39,14 @@ if.then:                                          ; preds = %land.lhs.true
   br label %if.end
 
 if.end:                                           ; preds = %if.then, %land.lhs.true, %entry
-  %4 = load i32* %a1, align 4
-  %5 = load i32* %b1, align 4
+  %4 = load i32, i32* %a1, align 4
+  %5 = load i32, i32* %b1, align 4
   %cmp2 = icmp ne i32 %4, %5
   br i1 %cmp2, label %land.lhs.true3, label %if.end6
 
 land.lhs.true3:                                   ; preds = %if.end
-  %6 = load i32* %c1, align 4
-  %7 = load i32* %d1, align 4
+  %6 = load i32, i32* %c1, align 4
+  %7 = load i32, i32* %d1, align 4
   %cmp4 = icmp ne i32 %6, %7
   br i1 %cmp4, label %if.then5, label %if.end6
 
diff --git a/test/CodeGen/R600/parallelorifcollapse.ll b/test/CodeGen/R600/parallelorifcollapse.ll
index feca688c30aa..1da1e91b8ab8 100644
--- a/test/CodeGen/R600/parallelorifcollapse.ll
+++ b/test/CodeGen/R600/parallelorifcollapse.ll
@@ -23,14 +23,14 @@ entry:
   %c1 = alloca i32, align 4
   %d1 = alloca i32, align 4
   %data = alloca i32, align 4
-  %0 = load i32* %a0, align 4
-  %1 = load i32* %b0, align 4
+  %0 = load i32, i32* %a0, align 4
+  %1 = load i32, i32* %b0, align 4
   %cmp = icmp ne i32 %0, %1
   br i1 %cmp, label %land.lhs.true, label %if.else
 
 land.lhs.true:                                    ; preds = %entry
-  %2 = load i32* %c0, align 4
-  %3 = load i32* %d0, align 4
+  %2 = load i32, i32* %c0, align 4
+  %3 = load i32, i32* %d0, align 4
   %cmp1 = icmp ne i32 %2, %3
   br i1 %cmp1, label %if.then, label %if.else
 
@@ -42,14 +42,14 @@ if.else:                                          ; preds = %land.lhs.true, %ent
   br label %if.end
 
 if.end:                                           ; preds = %if.else, %if.then
-  %4 = load i32* %a1, align 4
-  %5 = load i32* %b1, align 4
+  %4 = load i32, i32* %a1, align 4
+  %5 = load i32, i32* %b1, align 4
   %cmp2 = icmp ne i32 %4, %5
   br i1 %cmp2, label %land.lhs.true3, label %if.else6
 
 land.lhs.true3:                                   ; preds = %if.end
-  %6 = load i32* %c1, align 4
-  %7 = load i32* %d1, align 4
+  %6 = load i32, i32* %c1, align 4
+  %7 = load i32, i32* %d1, align 4
   %cmp4 = icmp ne i32 %6, %7
   br i1 %cmp4, label %if.then5, label %if.else6
 
diff --git a/test/CodeGen/R600/private-memory-atomics.ll b/test/CodeGen/R600/private-memory-atomics.ll
index 3ceb0c00d114..a008ac98a43b 100644
--- a/test/CodeGen/R600/private-memory-atomics.ll
+++ b/test/CodeGen/R600/private-memory-atomics.ll
@@ -7,11 +7,11 @@
 define void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind {
 entry:
   %tmp = alloca [2 x i32]
-  %tmp1 = getelementptr [2 x i32]* %tmp, i32 0, i32 0
-  %tmp2 = getelementptr [2 x i32]* %tmp, i32 0, i32 1
+  %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
+  %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
   store i32 0, i32* %tmp1
   store i32 1, i32* %tmp2
-  %tmp3 = getelementptr [2 x i32]* %tmp, i32 0, i32 %in
+  %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in
   %tmp4 = atomicrmw add i32* %tmp3, i32 7 acq_rel
   store i32 %tmp4, i32 addrspace(1)* %out
   ret void
@@ -20,11 +20,11 @@ entry:
 define void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind {
 entry:
   %tmp = alloca [2 x i32]
-  %tmp1 = getelementptr [2 x i32]* %tmp, i32 0, i32 0
-  %tmp2 = getelementptr [2 x i32]* %tmp, i32 0, i32 1
+  %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
+  %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
   store i32 0, i32* %tmp1
   store i32 1, i32* %tmp2
-  %tmp3 = getelementptr [2 x i32]* %tmp, i32 0, i32 %in
+  %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in
   %tmp4 = cmpxchg i32* %tmp3, i32 0, i32 1 acq_rel monotonic
   %val = extractvalue { i32, i1 } %tmp4, 0
   store i32 %val, i32 addrspace(1)* %out
diff --git a/test/CodeGen/R600/private-memory-broken.ll b/test/CodeGen/R600/private-memory-broken.ll
index 10590a9802fb..6b18a19f1956 100644
--- a/test/CodeGen/R600/private-memory-broken.ll
+++ b/test/CodeGen/R600/private-memory-broken.ll
@@ -10,11 +10,11 @@ declare i32 @foo(i32*) nounwind
 define void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind {
 entry:
   %tmp = alloca [2 x i32]
-  %tmp1 = getelementptr [2 x i32]* %tmp, i32 0, i32 0
-  %tmp2 = getelementptr [2 x i32]* %tmp, i32 0, i32 1
+  %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
+  %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
   store i32 0, i32* %tmp1
   store i32 1, i32* %tmp2
-  %tmp3 = getelementptr [2 x i32]* %tmp, i32 0, i32 %in
+  %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in
   %val = call i32 @foo(i32* %tmp3) nounwind
   store i32 %val, i32 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll
index 15153c69a48d..1c5629780508 100644
--- a/test/CodeGen/R600/private-memory.ll
+++ b/test/CodeGen/R600/private-memory.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
 ; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
 ; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
@@ -21,19 +23,19 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 entry:
   %stack = alloca [5 x i32], align 4
-  %0 = load i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32]* %stack, i32 0, i32 %0
+  %0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
   store i32 4, i32* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %in, i32 1
-  %1 = load i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32]* %stack, i32 0, i32 %1
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
   store i32 5, i32* %arrayidx3, align 4
-  %arrayidx10 = getelementptr inbounds [5 x i32]* %stack, i32 0, i32 0
-  %2 = load i32* %arrayidx10, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %2 = load i32, i32* %arrayidx10, align 4
   store i32 %2, i32 addrspace(1)* %out, align 4
-  %arrayidx12 = getelementptr inbounds [5 x i32]* %stack, i32 0, i32 1
-  %3 = load i32* %arrayidx12
-  %arrayidx13 = getelementptr inbounds i32 addrspace(1)* %out, i32 1
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %3 = load i32, i32* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
   store i32 %3, i32 addrspace(1)* %arrayidx13
   ret void
 }
@@ -55,18 +57,18 @@ define void @multiple_structs(i32 addrspace(1)* %out) {
 entry:
   %a = alloca %struct.point
   %b = alloca %struct.point
-  %a.x.ptr = getelementptr %struct.point* %a, i32 0, i32 0
-  %a.y.ptr = getelementptr %struct.point* %a, i32 0, i32 1
-  %b.x.ptr = getelementptr %struct.point* %b, i32 0, i32 0
-  %b.y.ptr = getelementptr %struct.point* %b, i32 0, i32 1
+  %a.x.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0
+  %a.y.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 1
+  %b.x.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0
+  %b.y.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 1
   store i32 0, i32* %a.x.ptr
   store i32 1, i32* %a.y.ptr
   store i32 2, i32* %b.x.ptr
   store i32 3, i32* %b.y.ptr
-  %a.indirect.ptr = getelementptr %struct.point* %a, i32 0, i32 0
-  %b.indirect.ptr = getelementptr %struct.point* %b, i32 0, i32 0
-  %a.indirect = load i32* %a.indirect.ptr
-  %b.indirect = load i32* %b.indirect.ptr
+  %a.indirect.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0
+  %b.indirect.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0
+  %a.indirect = load i32, i32* %a.indirect.ptr
+  %b.indirect = load i32, i32* %b.indirect.ptr
   %0 = add i32 %a.indirect, %b.indirect
   store i32 %0, i32 addrspace(1)* %out
   ret void
@@ -84,21 +86,21 @@ define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %prv_array_const = alloca [2 x i32]
   %prv_array = alloca [2 x i32]
-  %a = load i32 addrspace(1)* %in
-  %b_src_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %b = load i32 addrspace(1)* %b_src_ptr
-  %a_dst_ptr = getelementptr [2 x i32]* %prv_array_const, i32 0, i32 0
+  %a = load i32, i32 addrspace(1)* %in
+  %b_src_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %b = load i32, i32 addrspace(1)* %b_src_ptr
+  %a_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
   store i32 %a, i32* %a_dst_ptr
-  %b_dst_ptr = getelementptr [2 x i32]* %prv_array_const, i32 0, i32 1
+  %b_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 1
   store i32 %b, i32* %b_dst_ptr
   br label %for.body
 
 for.body:
   %inc = phi i32 [0, %entry], [%count, %for.body]
-  %x_ptr = getelementptr [2 x i32]* %prv_array_const, i32 0, i32 0
-  %x = load i32* %x_ptr
-  %y_ptr = getelementptr [2 x i32]* %prv_array, i32 0, i32 0
-  %y = load i32* %y_ptr
+  %x_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
+  %x = load i32, i32* %x_ptr
+  %y_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
+  %y = load i32, i32* %y_ptr
   %xy = add i32 %x, %y
   store i32 %xy, i32* %y_ptr
   %count = add i32 %inc, 1
@@ -106,8 +108,8 @@ for.body:
   br i1 %done, label %for.end, label %for.body
 
 for.end:
-  %value_ptr = getelementptr [2 x i32]* %prv_array, i32 0, i32 0
-  %value = load i32* %value_ptr
+  %value_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
+  %value = load i32, i32* %value_ptr
   store i32 %value, i32 addrspace(1)* %out
   ret void
 }
@@ -122,12 +124,12 @@ for.end:
 define void @short_array(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %0 = alloca [2 x i16]
-  %1 = getelementptr [2 x i16]* %0, i32 0, i32 0
-  %2 = getelementptr [2 x i16]* %0, i32 0, i32 1
+  %1 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 0
+  %2 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 1
   store i16 0, i16* %1
   store i16 1, i16* %2
-  %3 = getelementptr [2 x i16]* %0, i32 0, i32 %index
-  %4 = load i16* %3
+  %3 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 %index
+  %4 = load i16, i16* %3
   %5 = sext i16 %4 to i32
   store i32 %5, i32 addrspace(1)* %out
   ret void
@@ -142,12 +144,12 @@ entry:
 define void @char_array(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %0 = alloca [2 x i8]
-  %1 = getelementptr [2 x i8]* %0, i32 0, i32 0
-  %2 = getelementptr [2 x i8]* %0, i32 0, i32 1
+  %1 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 0
+  %2 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 1
   store i8 0, i8* %1
   store i8 1, i8* %2
-  %3 = getelementptr [2 x i8]* %0, i32 0, i32 %index
-  %4 = load i8* %3
+  %3 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 %index
+  %4 = load i8, i8* %3
   %5 = sext i8 %4 to i32
   store i32 %5, i32 addrspace(1)* %out
   ret void
@@ -165,12 +167,12 @@ entry:
 define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = alloca [2 x i32]
-  %1 = getelementptr [2 x i32]* %0, i32 0, i32 0
-  %2 = getelementptr [2 x i32]* %0, i32 0, i32 1
+  %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0
+  %2 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 1
   store i32 0, i32* %1
   store i32 1, i32* %2
-  %3 = getelementptr [2 x i32]* %0, i32 0, i32 %in
-  %4 = load i32* %3
+  %3 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 %in
+  %4 = load i32, i32* %3
   %5 = call i32 @llvm.r600.read.tidig.x()
   %6 = add i32 %4, %5
   store i32 %6, i32 addrspace(1)* %out
@@ -188,20 +190,20 @@ define void @no_overlap(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = alloca [3 x i8], align 1
   %1 = alloca [2 x i8], align 1
-  %2 = getelementptr [3 x i8]* %0, i32 0, i32 0
-  %3 = getelementptr [3 x i8]* %0, i32 0, i32 1
-  %4 = getelementptr [3 x i8]* %0, i32 0, i32 2
-  %5 = getelementptr [2 x i8]* %1, i32 0, i32 0
-  %6 = getelementptr [2 x i8]* %1, i32 0, i32 1
+  %2 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 0
+  %3 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 1
+  %4 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 2
+  %5 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 0
+  %6 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 1
   store i8 0, i8* %2
   store i8 1, i8* %3
   store i8 2, i8* %4
   store i8 1, i8* %5
   store i8 0, i8* %6
-  %7 = getelementptr [3 x i8]* %0, i32 0, i32 %in
-  %8 = getelementptr [2 x i8]* %1, i32 0, i32 %in
-  %9 = load i8* %7
-  %10 = load i8* %8
+  %7 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 %in
+  %8 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 %in
+  %9 = load i8, i8* %7
+  %10 = load i8, i8* %8
   %11 = add i8 %9, %10
   %12 = sext i8 %11 to i32
   store i32 %12, i32 addrspace(1)* %out
@@ -211,12 +213,12 @@ entry:
 define void @char_array_array(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %alloca = alloca [2 x [2 x i8]]
-  %gep0 = getelementptr [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
-  %gep1 = getelementptr [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1
+  %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
+  %gep1 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1
   store i8 0, i8* %gep0
   store i8 1, i8* %gep1
-  %gep2 = getelementptr [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index
-  %load = load i8* %gep2
+  %gep2 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index
+  %load = load i8, i8* %gep2
   %sext = sext i8 %load to i32
   store i32 %sext, i32 addrspace(1)* %out
   ret void
@@ -225,12 +227,12 @@ entry:
 define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %alloca = alloca [2 x [2 x i32]]
-  %gep0 = getelementptr [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
-  %gep1 = getelementptr [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1
+  %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
+  %gep1 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1
   store i32 0, i32* %gep0
   store i32 1, i32* %gep1
-  %gep2 = getelementptr [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index
-  %load = load i32* %gep2
+  %gep2 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index
+  %load = load i32, i32* %gep2
   store i32 %load, i32 addrspace(1)* %out
   ret void
 }
@@ -238,12 +240,12 @@ entry:
 define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) {
 entry:
   %alloca = alloca [2 x [2 x i64]]
-  %gep0 = getelementptr [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
-  %gep1 = getelementptr [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1
+  %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
+  %gep1 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1
   store i64 0, i64* %gep0
   store i64 1, i64* %gep1
-  %gep2 = getelementptr [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index
-  %load = load i64* %gep2
+  %gep2 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index
+  %load = load i64, i64* %gep2
   store i64 %load, i64 addrspace(1)* %out
   ret void
 }
@@ -253,12 +255,12 @@ entry:
 define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %alloca = alloca [2 x [2 x %struct.pair32]]
-  %gep0 = getelementptr [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
-  %gep1 = getelementptr [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1
+  %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
+  %gep1 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1
   store i32 0, i32* %gep0
   store i32 1, i32* %gep1
-  %gep2 = getelementptr [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0
-  %load = load i32* %gep2
+  %gep2 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0
+  %load = load i32, i32* %gep2
   store i32 %load, i32 addrspace(1)* %out
   ret void
 }
@@ -266,12 +268,12 @@ entry:
 define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %alloca = alloca [2 x %struct.pair32]
-  %gep0 = getelementptr [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
-  %gep1 = getelementptr [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0
+  %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
+  %gep1 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0
   store i32 0, i32* %gep0
   store i32 1, i32* %gep1
-  %gep2 = getelementptr [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0
-  %load = load i32* %gep2
+  %gep2 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0
+  %load = load i32, i32* %gep2
   store i32 %load, i32 addrspace(1)* %out
   ret void
 }
@@ -279,13 +281,13 @@ entry:
 define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
 entry:
   %tmp = alloca [2 x i32]
-  %tmp1 = getelementptr [2 x i32]* %tmp, i32 0, i32 0
-  %tmp2 = getelementptr [2 x i32]* %tmp, i32 0, i32 1
+  %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
+  %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
   store i32 0, i32* %tmp1
   store i32 1, i32* %tmp2
   %cmp = icmp eq i32 %in, 0
   %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2
-  %load = load i32* %sel
+  %load = load i32, i32* %sel
   store i32 %load, i32 addrspace(1)* %out
   ret void
 }
@@ -299,13 +301,13 @@ entry:
 ; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5
 define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %alloca = alloca [16 x i32]
-  %tmp0 = getelementptr [16 x i32]* %alloca, i32 0, i32 %a
+  %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   store i32 5, i32* %tmp0
   %tmp1 = ptrtoint [16 x i32]* %alloca to i32
   %tmp2 = add i32 %tmp1, 5
   %tmp3 = inttoptr i32 %tmp2 to i32*
-  %tmp4 = getelementptr i32* %tmp3, i32 %b
-  %tmp5 = load i32* %tmp4
+  %tmp4 = getelementptr i32, i32* %tmp3, i32 %b
+  %tmp5 = load i32, i32* %tmp4
   store i32 %tmp5, i32 addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/R600/pv-packing.ll b/test/CodeGen/R600/pv-packing.ll
index e5615b99728e..abeae563ff3f 100644
--- a/test/CodeGen/R600/pv-packing.ll
+++ b/test/CodeGen/R600/pv-packing.ll
@@ -14,8 +14,8 @@ main_body:
   %6 = extractelement <4 x float> %reg3, i32 0
   %7 = extractelement <4 x float> %reg3, i32 1
   %8 = extractelement <4 x float> %reg3, i32 2
-  %9 = load <4 x float> addrspace(8)* null
-  %10 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %9 = load <4 x float>, <4 x float> addrspace(8)* null
+  %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %11 = call float @llvm.AMDGPU.dp4(<4 x float> %9, <4 x float> %9)
   %12 = fmul float %0, %3
   %13 = fadd float %12, %6
diff --git a/test/CodeGen/R600/pv.ll b/test/CodeGen/R600/pv.ll
index 1908f15949a2..9a57dd19765a 100644
--- a/test/CodeGen/R600/pv.ll
+++ b/test/CodeGen/R600/pv.ll
@@ -33,63 +33,63 @@ main_body:
   %25 = extractelement <4 x float> %reg7, i32 1
   %26 = extractelement <4 x float> %reg7, i32 2
   %27 = extractelement <4 x float> %reg7, i32 3
-  %28 = load <4 x float> addrspace(8)* null
+  %28 = load <4 x float>, <4 x float> addrspace(8)* null
   %29 = extractelement <4 x float> %28, i32 0
   %30 = fmul float %0, %29
-  %31 = load <4 x float> addrspace(8)* null
+  %31 = load <4 x float>, <4 x float> addrspace(8)* null
   %32 = extractelement <4 x float> %31, i32 1
   %33 = fmul float %0, %32
-  %34 = load <4 x float> addrspace(8)* null
+  %34 = load <4 x float>, <4 x float> addrspace(8)* null
   %35 = extractelement <4 x float> %34, i32 2
   %36 = fmul float %0, %35
-  %37 = load <4 x float> addrspace(8)* null
+  %37 = load <4 x float>, <4 x float> addrspace(8)* null
   %38 = extractelement <4 x float> %37, i32 3
   %39 = fmul float %0, %38
-  %40 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %41 = extractelement <4 x float> %40, i32 0
   %42 = fmul float %1, %41
   %43 = fadd float %42, %30
-  %44 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %45 = extractelement <4 x float> %44, i32 1
   %46 = fmul float %1, %45
   %47 = fadd float %46, %33
-  %48 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %49 = extractelement <4 x float> %48, i32 2
   %50 = fmul float %1, %49
   %51 = fadd float %50, %36
-  %52 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %53 = extractelement <4 x float> %52, i32 3
   %54 = fmul float %1, %53
   %55 = fadd float %54, %39
-  %56 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %57 = extractelement <4 x float> %56, i32 0
   %58 = fmul float %2, %57
   %59 = fadd float %58, %43
-  %60 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %61 = extractelement <4 x float> %60, i32 1
   %62 = fmul float %2, %61
   %63 = fadd float %62, %47
-  %64 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %65 = extractelement <4 x float> %64, i32 2
   %66 = fmul float %2, %65
   %67 = fadd float %66, %51
-  %68 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %69 = extractelement <4 x float> %68, i32 3
   %70 = fmul float %2, %69
   %71 = fadd float %70, %55
-  %72 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
   %73 = extractelement <4 x float> %72, i32 0
   %74 = fmul float %3, %73
   %75 = fadd float %74, %59
-  %76 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
   %77 = extractelement <4 x float> %76, i32 1
   %78 = fmul float %3, %77
   %79 = fadd float %78, %63
-  %80 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
   %81 = extractelement <4 x float> %80, i32 2
   %82 = fmul float %3, %81
   %83 = fadd float %82, %67
-  %84 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %84 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
   %85 = extractelement <4 x float> %84, i32 3
   %86 = fmul float %3, %85
   %87 = fadd float %86, %71
@@ -107,15 +107,15 @@ main_body:
   %99 = fmul float %4, %98
   %100 = fmul float %5, %98
   %101 = fmul float %6, %98
-  %102 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %102 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
   %103 = extractelement <4 x float> %102, i32 0
   %104 = fmul float %103, %8
   %105 = fadd float %104, %20
-  %106 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %106 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
   %107 = extractelement <4 x float> %106, i32 1
   %108 = fmul float %107, %9
   %109 = fadd float %108, %21
-  %110 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %110 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
   %111 = extractelement <4 x float> %110, i32 2
   %112 = fmul float %111, %10
   %113 = fadd float %112, %22
@@ -123,11 +123,11 @@ main_body:
   %115 = call float @llvm.AMDIL.clamp.(float %109, float 0.000000e+00, float 1.000000e+00)
   %116 = call float @llvm.AMDIL.clamp.(float %113, float 0.000000e+00, float 1.000000e+00)
   %117 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00)
-  %118 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %118 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
   %119 = extractelement <4 x float> %118, i32 0
-  %120 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %120 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
   %121 = extractelement <4 x float> %120, i32 1
-  %122 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %122 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
   %123 = extractelement <4 x float> %122, i32 2
   %124 = insertelement <4 x float> undef, float %99, i32 0
   %125 = insertelement <4 x float> %124, float %100, i32 1
@@ -138,11 +138,11 @@ main_body:
   %130 = insertelement <4 x float> %129, float %123, i32 2
   %131 = insertelement <4 x float> %130, float 0.000000e+00, i32 3
   %132 = call float @llvm.AMDGPU.dp4(<4 x float> %127, <4 x float> %131)
-  %133 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %133 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
   %134 = extractelement <4 x float> %133, i32 0
-  %135 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %135 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
   %136 = extractelement <4 x float> %135, i32 1
-  %137 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %137 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
   %138 = extractelement <4 x float> %137, i32 2
   %139 = insertelement <4 x float> undef, float %99, i32 0
   %140 = insertelement <4 x float> %139, float %100, i32 1
@@ -153,31 +153,31 @@ main_body:
   %145 = insertelement <4 x float> %144, float %138, i32 2
   %146 = insertelement <4 x float> %145, float 0.000000e+00, i32 3
   %147 = call float @llvm.AMDGPU.dp4(<4 x float> %142, <4 x float> %146)
-  %148 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+  %148 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
   %149 = extractelement <4 x float> %148, i32 0
   %150 = fmul float %149, %8
-  %151 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+  %151 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
   %152 = extractelement <4 x float> %151, i32 1
   %153 = fmul float %152, %9
-  %154 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+  %154 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
   %155 = extractelement <4 x float> %154, i32 2
   %156 = fmul float %155, %10
-  %157 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %157 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
   %158 = extractelement <4 x float> %157, i32 0
   %159 = fmul float %158, %12
-  %160 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %160 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
   %161 = extractelement <4 x float> %160, i32 1
   %162 = fmul float %161, %13
-  %163 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %163 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
   %164 = extractelement <4 x float> %163, i32 2
   %165 = fmul float %164, %14
-  %166 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+  %166 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
   %167 = extractelement <4 x float> %166, i32 0
   %168 = fmul float %167, %16
-  %169 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+  %169 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
   %170 = extractelement <4 x float> %169, i32 1
   %171 = fmul float %170, %17
-  %172 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+  %172 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
   %173 = extractelement <4 x float> %172, i32 2
   %174 = fmul float %173, %18
   %175 = fcmp uge float %132, 0.000000e+00
diff --git a/test/CodeGen/R600/r600-export-fix.ll b/test/CodeGen/R600/r600-export-fix.ll
index 7d7285632078..7cb80195b368 100644
--- a/test/CodeGen/R600/r600-export-fix.ll
+++ b/test/CodeGen/R600/r600-export-fix.ll
@@ -16,83 +16,83 @@ main_body:
   %1 = extractelement <4 x float> %reg1, i32 1
   %2 = extractelement <4 x float> %reg1, i32 2
   %3 = extractelement <4 x float> %reg1, i32 3
-  %4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
   %5 = extractelement <4 x float> %4, i32 0
   %6 = fmul float %5, %0
-  %7 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
   %8 = extractelement <4 x float> %7, i32 1
   %9 = fmul float %8, %0
-  %10 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
   %11 = extractelement <4 x float> %10, i32 2
   %12 = fmul float %11, %0
-  %13 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %13 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
   %14 = extractelement <4 x float> %13, i32 3
   %15 = fmul float %14, %0
-  %16 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
   %17 = extractelement <4 x float> %16, i32 0
   %18 = fmul float %17, %1
   %19 = fadd float %18, %6
-  %20 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
   %21 = extractelement <4 x float> %20, i32 1
   %22 = fmul float %21, %1
   %23 = fadd float %22, %9
-  %24 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
   %25 = extractelement <4 x float> %24, i32 2
   %26 = fmul float %25, %1
   %27 = fadd float %26, %12
-  %28 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
   %29 = extractelement <4 x float> %28, i32 3
   %30 = fmul float %29, %1
   %31 = fadd float %30, %15
-  %32 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+  %32 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
   %33 = extractelement <4 x float> %32, i32 0
   %34 = fmul float %33, %2
   %35 = fadd float %34, %19
-  %36 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+  %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
   %37 = extractelement <4 x float> %36, i32 1
   %38 = fmul float %37, %2
   %39 = fadd float %38, %23
-  %40 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+  %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
   %41 = extractelement <4 x float> %40, i32 2
   %42 = fmul float %41, %2
   %43 = fadd float %42, %27
-  %44 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+  %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
   %45 = extractelement <4 x float> %44, i32 3
   %46 = fmul float %45, %2
   %47 = fadd float %46, %31
-  %48 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
   %49 = extractelement <4 x float> %48, i32 0
   %50 = fmul float %49, %3
   %51 = fadd float %50, %35
-  %52 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
   %53 = extractelement <4 x float> %52, i32 1
   %54 = fmul float %53, %3
   %55 = fadd float %54, %39
-  %56 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
   %57 = extractelement <4 x float> %56, i32 2
   %58 = fmul float %57, %3
   %59 = fadd float %58, %43
-  %60 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
   %61 = extractelement <4 x float> %60, i32 3
   %62 = fmul float %61, %3
   %63 = fadd float %62, %47
-  %64 = load <4 x float> addrspace(8)* null
+  %64 = load <4 x float>, <4 x float> addrspace(8)* null
   %65 = extractelement <4 x float> %64, i32 0
-  %66 = load <4 x float> addrspace(8)* null
+  %66 = load <4 x float>, <4 x float> addrspace(8)* null
   %67 = extractelement <4 x float> %66, i32 1
-  %68 = load <4 x float> addrspace(8)* null
+  %68 = load <4 x float>, <4 x float> addrspace(8)* null
   %69 = extractelement <4 x float> %68, i32 2
-  %70 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %70 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %71 = extractelement <4 x float> %70, i32 0
-  %72 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %73 = extractelement <4 x float> %72, i32 1
-  %74 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %74 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %75 = extractelement <4 x float> %74, i32 2
-  %76 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
   %77 = extractelement <4 x float> %76, i32 0
-  %78 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %78 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
   %79 = extractelement <4 x float> %78, i32 1
-  %80 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
   %81 = extractelement <4 x float> %80, i32 2
   %82 = insertelement <4 x float> undef, float %51, i32 0
   %83 = insertelement <4 x float> %82, float %55, i32 1
diff --git a/test/CodeGen/R600/r600cfg.ll b/test/CodeGen/R600/r600cfg.ll
index dddc9de7e963..c7b9d65220f3 100644
--- a/test/CodeGen/R600/r600cfg.ll
+++ b/test/CodeGen/R600/r600cfg.ll
@@ -83,7 +83,7 @@ ELSE45:                                           ; preds = %ENDIF40
 ENDIF43:                                          ; preds = %ELSE45, %IF44
   %.sink = phi i32 [ %49, %IF44 ], [ %51, %ELSE45 ]
   %52 = bitcast i32 %.sink to float
-  %53 = load <4 x float> addrspace(8)* null
+  %53 = load <4 x float>, <4 x float> addrspace(8)* null
   %54 = extractelement <4 x float> %53, i32 0
   %55 = bitcast float %54 to i32
   br label %LOOP47
diff --git a/test/CodeGen/R600/register-count-comments.ll b/test/CodeGen/R600/register-count-comments.ll
index 2b49f977def7..de6bfb310883 100644
--- a/test/CodeGen/R600/register-count-comments.ll
+++ b/test/CodeGen/R600/register-count-comments.ll
@@ -9,11 +9,11 @@ declare i32 @llvm.SI.tid() nounwind readnone
 ; SI: ; NumVgprs: {{[0-9]+}}
 define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind {
   %tid = call i32 @llvm.SI.tid() nounwind readnone
-  %aptr = getelementptr i32 addrspace(1)* %abase, i32 %tid
-  %bptr = getelementptr i32 addrspace(1)* %bbase, i32 %tid
-  %outptr = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %a = load i32 addrspace(1)* %aptr, align 4
-  %b = load i32 addrspace(1)* %bptr, align 4
+  %aptr = getelementptr i32, i32 addrspace(1)* %abase, i32 %tid
+  %bptr = getelementptr i32, i32 addrspace(1)* %bbase, i32 %tid
+  %outptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %aptr, align 4
+  %b = load i32, i32 addrspace(1)* %bptr, align 4
   %result = add i32 %a, %b
   store i32 %result, i32 addrspace(1)* %outptr, align 4
   ret void
diff --git a/test/CodeGen/R600/reorder-stores.ll b/test/CodeGen/R600/reorder-stores.ll
index ea50d5eed4df..187650ff9a53 100644
--- a/test/CodeGen/R600/reorder-stores.ll
+++ b/test/CodeGen/R600/reorder-stores.ll
@@ -12,8 +12,8 @@
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind {
-  %tmp1 = load <2 x double> addrspace(1)* %x, align 16
-  %tmp4 = load <2 x double> addrspace(1)* %y, align 16
+  %tmp1 = load <2 x double>, <2 x double> addrspace(1)* %x, align 16
+  %tmp4 = load <2 x double>, <2 x double> addrspace(1)* %y, align 16
   store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16
   store <2 x double> %tmp1, <2 x double> addrspace(1)* %y, align 16
   ret void
@@ -26,8 +26,8 @@ define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocap
 ; SI: ds_write_b64
 ; SI: s_endpgm
 define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
-  %tmp1 = load <2 x double> addrspace(3)* %x, align 16
-  %tmp4 = load <2 x double> addrspace(3)* %y, align 16
+  %tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16
+  %tmp4 = load <2 x double>, <2 x double> addrspace(3)* %y, align 16
   store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16
   store <2 x double> %tmp1, <2 x double> addrspace(3)* %y, align 16
   ret void
@@ -76,8 +76,8 @@ define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace
 ; SI: buffer_store_dword
 ; SI: s_endpgm
 define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind {
-  %tmp1 = load <8 x i32> addrspace(1)* %x, align 32
-  %tmp4 = load <8 x i32> addrspace(1)* %y, align 32
+  %tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32
+  %tmp4 = load <8 x i32>, <8 x i32> addrspace(1)* %y, align 32
   store <8 x i32> %tmp4, <8 x i32> addrspace(1)* %x, align 32
   store <8 x i32> %tmp1, <8 x i32> addrspace(1)* %y, align 32
   ret void
@@ -91,8 +91,8 @@ define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* no
 ; SI: ds_write_b64
 ; SI: s_endpgm
 define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind {
-  %tmp1 = load <2 x i32> addrspace(3)* %x, align 8
-  %tmp4 = load <2 x i32> addrspace(3)* %y, align 8
+  %tmp1 = load <2 x i32>, <2 x i32> addrspace(3)* %x, align 8
+  %tmp4 = load <2 x i32>, <2 x i32> addrspace(3)* %y, align 8
   %tmp1ext = zext <2 x i32> %tmp1 to <2 x i64>
   %tmp4ext = zext <2 x i32> %tmp4 to <2 x i64>
   %tmp7 = add <2 x i64> %tmp1ext, <i64 1, i64 1>
diff --git a/test/CodeGen/R600/rotl.i64.ll b/test/CodeGen/R600/rotl.i64.ll
index 6da17a4fea93..3f4ceb7e0310 100644
--- a/test/CodeGen/R600/rotl.i64.ll
+++ b/test/CodeGen/R600/rotl.i64.ll
@@ -28,8 +28,8 @@ entry:
 ; BOTH: s_endpgm
 define void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
 entry:
-  %x = load i64 addrspace(1)* %xptr, align 8
-  %y = load i64 addrspace(1)* %yptr, align 8
+  %x = load i64, i64 addrspace(1)* %xptr, align 8
+  %y = load i64, i64 addrspace(1)* %yptr, align 8
   %tmp0 = shl i64 %x, %y
   %tmp1 = sub i64 64, %y
   %tmp2 = lshr i64 %x, %tmp1
diff --git a/test/CodeGen/R600/rotr.i64.ll b/test/CodeGen/R600/rotr.i64.ll
index f1d1d265f366..586de44a566c 100644
--- a/test/CodeGen/R600/rotr.i64.ll
+++ b/test/CodeGen/R600/rotr.i64.ll
@@ -26,8 +26,8 @@ entry:
 ; BOTH: v_or_b32
 define void @v_rotr_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
 entry:
-  %x = load i64 addrspace(1)* %xptr, align 8
-  %y = load i64 addrspace(1)* %yptr, align 8
+  %x = load i64, i64 addrspace(1)* %xptr, align 8
+  %y = load i64, i64 addrspace(1)* %yptr, align 8
   %tmp0 = sub i64 64, %y
   %tmp1 = shl i64 %x, %tmp0
   %tmp2 = lshr i64 %x, %y
@@ -50,8 +50,8 @@ entry:
 ; BOTH-LABEL: {{^}}v_rotr_v2i64:
 define void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) {
 entry:
-  %x = load <2 x i64> addrspace(1)* %xptr, align 8
-  %y = load <2 x i64> addrspace(1)* %yptr, align 8
+  %x = load <2 x i64>, <2 x i64> addrspace(1)* %xptr, align 8
+  %y = load <2 x i64>, <2 x i64> addrspace(1)* %yptr, align 8
   %tmp0 = sub <2 x i64> <i64 64, i64 64>, %y
   %tmp1 = shl <2 x i64> %x, %tmp0
   %tmp2 = lshr <2 x i64> %x, %y
diff --git a/test/CodeGen/R600/rsq.ll b/test/CodeGen/R600/rsq.ll
index b8a23df63d83..b67b800c7374 100644
--- a/test/CodeGen/R600/rsq.ll
+++ b/test/CodeGen/R600/rsq.ll
@@ -9,7 +9,7 @@ declare double @llvm.sqrt.f64(double) nounwind readnone
 ; SI: v_rsq_f32_e32
 ; SI: s_endpgm
 define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
-  %val = load float addrspace(1)* %in, align 4
+  %val = load float, float addrspace(1)* %in, align 4
   %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone
   %div = fdiv float 1.0, %sqrt
   store float %div, float addrspace(1)* %out, align 4
@@ -21,7 +21,7 @@ define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noali
 ; SI-SAFE: v_sqrt_f64_e32
 ; SI: s_endpgm
 define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
-  %val = load double addrspace(1)* %in, align 4
+  %val = load double, double addrspace(1)* %in, align 4
   %sqrt = call double @llvm.sqrt.f64(double %val) nounwind readnone
   %div = fdiv double 1.0, %sqrt
   store double %div, double addrspace(1)* %out, align 4
@@ -57,14 +57,14 @@ define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind
 ; SI: s_endpgm
 define void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
-  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
-  %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
 
-  %a = load float addrspace(1)* %gep.0
-  %b = load float addrspace(1)* %gep.1
-  %c = load float addrspace(1)* %gep.2
+  %a = load float, float addrspace(1)* %gep.0
+  %b = load float, float addrspace(1)* %gep.1
+  %c = load float, float addrspace(1)* %gep.2
 
   %x = call float @llvm.sqrt.f32(float %a)
   %y = fmul float %x, %b
diff --git a/test/CodeGen/R600/s_movk_i32.ll b/test/CodeGen/R600/s_movk_i32.ll
index 8be2d1d923cc..6b1a36c979c2 100644
--- a/test/CodeGen/R600/s_movk_i32.ll
+++ b/test/CodeGen/R600/s_movk_i32.ll
@@ -9,7 +9,7 @@
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
 define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64 addrspace(1)* %a, align 4
+  %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 4295032831 ; ((1 << 16) - 1) | (1 << 32)
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -23,7 +23,7 @@ define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
 define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64 addrspace(1)* %a, align 4
+  %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 4295000063 ; ((1 << 15) - 1) | (1 << 32)
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -37,7 +37,7 @@ define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
 define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64 addrspace(1)* %a, align 4
+  %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 274877939711 ; ((1 << 15) - 1) | (64 << 32)
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -51,7 +51,7 @@ define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
 define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64 addrspace(1)* %a, align 4
+  %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 4295000064 ; (1 << 15) | (1 << 32)
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -65,7 +65,7 @@ define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
 define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64 addrspace(1)* %a, align 4
+  %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 4295098368 ; (1 << 17) | (1 << 32)
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -79,7 +79,7 @@ define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
 define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64 addrspace(1)* %a, align 4
+  %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 18374967954648334319 ; -17 & 0xff00ffffffffffff
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -93,7 +93,7 @@ define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
 define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64 addrspace(1)* %a, align 4
+  %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 270582939713 ; 65 | (63 << 32)
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -107,7 +107,7 @@ define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
 define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64 addrspace(1)* %a, align 4
+  %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 70368744185856; ((1 << 13)) | ((1 << 14) << 32)
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -122,7 +122,7 @@ define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
 define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64 addrspace(1)* %a, align 4
+  %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255906816 ; 0x11111111ffff8000
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -136,7 +136,7 @@ define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
 define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64 addrspace(1)* %a, align 4
+  %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255906817 ; 0x11111111ffff8001
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -150,7 +150,7 @@ define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
 define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64 addrspace(1)* %a, align 4
+  %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255909000 ; 0x11111111ffff8888
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -164,7 +164,7 @@ define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 ad
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
 define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64 addrspace(1)* %a, align 4
+  %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255910911 ; 0x11111111ffff8fff
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -178,7 +178,7 @@ define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 ad
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
 define void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64 addrspace(1)* %a, align 4
+  %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255902721 ; 0x11111111ffff7001
   store i64 %or, i64 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/saddo.ll b/test/CodeGen/R600/saddo.ll
index 8e625c1110a6..f8ced7942a60 100644
--- a/test/CodeGen/R600/saddo.ll
+++ b/test/CodeGen/R600/saddo.ll
@@ -28,8 +28,8 @@ define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
 
 ; FUNC-LABEL: {{^}}v_saddo_i32:
 define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
-  %a = load i32 addrspace(1)* %aptr, align 4
-  %b = load i32 addrspace(1)* %bptr, align 4
+  %a = load i32, i32 addrspace(1)* %aptr, align 4
+  %b = load i32, i32 addrspace(1)* %bptr, align 4
   %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %sadd, 0
   %carry = extractvalue { i32, i1 } %sadd, 1
@@ -52,8 +52,8 @@ define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64
 ; SI: v_add_i32
 ; SI: v_addc_u32
 define void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
-  %a = load i64 addrspace(1)* %aptr, align 4
-  %b = load i64 addrspace(1)* %bptr, align 4
+  %a = load i64, i64 addrspace(1)* %aptr, align 4
+  %b = load i64, i64 addrspace(1)* %bptr, align 4
   %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %sadd, 0
   %carry = extractvalue { i64, i1 } %sadd, 1
diff --git a/test/CodeGen/R600/salu-to-valu.ll b/test/CodeGen/R600/salu-to-valu.ll
index dfb181da1801..0b9649576545 100644
--- a/test/CodeGen/R600/salu-to-valu.ll
+++ b/test/CodeGen/R600/salu-to-valu.ll
@@ -27,11 +27,11 @@ entry:
 loop:
   %4 = phi i64 [0, %entry], [%5, %loop]
   %5 = add i64 %2, %4
-  %6 = getelementptr i8 addrspace(1)* %in, i64 %5
-  %7 = load i8 addrspace(1)* %6, align 1
+  %6 = getelementptr i8, i8 addrspace(1)* %in, i64 %5
+  %7 = load i8, i8 addrspace(1)* %6, align 1
   %8 = or i64 %5, 1
-  %9 = getelementptr i8 addrspace(1)* %in, i64 %8
-  %10 = load i8 addrspace(1)* %9, align 1
+  %9 = getelementptr i8, i8 addrspace(1)* %in, i64 %8
+  %10 = load i8, i8 addrspace(1)* %9, align 1
   %11 = add i8 %7, %10
   %12 = sext i8 %11 to i32
   store i32 %12, i32 addrspace(1)* %out
@@ -59,18 +59,18 @@ entry:
   br i1 %0, label %if, label %else
 
 if:
-  %1 = load i32 addrspace(2)* addrspace(1)* %in
+  %1 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in
   br label %endif
 
 else:
-  %2 = getelementptr i32 addrspace(2)* addrspace(1)* %in
-  %3 = load i32 addrspace(2)* addrspace(1)* %2
+  %2 = getelementptr i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in
+  %3 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %2
   br label %endif
 
 endif:
   %4 = phi i32 addrspace(2)*  [%1, %if], [%3, %else]
-  %5 = getelementptr i32 addrspace(2)* %4, i32 3000
-  %6 = load i32 addrspace(2)* %5
+  %5 = getelementptr i32, i32 addrspace(2)* %4, i32 3000
+  %6 = load i32, i32 addrspace(2)* %5
   store i32 %6, i32 addrspace(1)* %out
   ret void
 }
@@ -83,8 +83,8 @@ define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone
   %1 = add i32 %0, 4
-  %2 = getelementptr [8 x i32] addrspace(2)* %in, i32 %0, i32 4
-  %3 = load i32 addrspace(2)* %2
+  %2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %0, i32 4
+  %3 = load i32, i32 addrspace(2)* %2
   store i32 %3, i32 addrspace(1)* %out
   ret void
 }
@@ -95,9 +95,9 @@ entry:
 define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) {
 entry:
   %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1
-  %tmp1 = getelementptr inbounds i32 addrspace(2)* %in, i32 %tmp0
+  %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
   %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)*
-  %tmp3 = load <8 x i32> addrspace(2)* %tmp2, align 4
+  %tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4
   store <8 x i32> %tmp3, <8 x i32> addrspace(1)* %out, align 32
   ret void
 }
@@ -110,9 +110,9 @@ entry:
 define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) {
 entry:
   %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1
-  %tmp1 = getelementptr inbounds i32 addrspace(2)* %in, i32 %tmp0
+  %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
   %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)*
-  %tmp3 = load <16 x i32> addrspace(2)* %tmp2, align 4
+  %tmp3 = load <16 x i32>, <16 x i32> addrspace(2)* %tmp2, align 4
   store <16 x i32> %tmp3, <16 x i32> addrspace(1)* %out, align 32
   ret void
 }
diff --git a/test/CodeGen/R600/scalar_to_vector.ll b/test/CodeGen/R600/scalar_to_vector.ll
index b82e5526f751..0970e5d30630 100644
--- a/test/CodeGen/R600/scalar_to_vector.ll
+++ b/test/CodeGen/R600/scalar_to_vector.ll
@@ -11,7 +11,7 @@
 ; SI: buffer_store_short [[RESULT]]
 ; SI: s_endpgm
 define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %tmp1 = load i32 addrspace(1)* %in, align 4
+  %tmp1 = load i32, i32 addrspace(1)* %in, align 4
   %bc = bitcast i32 %tmp1 to <2 x i16>
   %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8
@@ -27,7 +27,7 @@ define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(
 ; SI: buffer_store_short [[RESULT]]
 ; SI: s_endpgm
 define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
-  %tmp1 = load float addrspace(1)* %in, align 4
+  %tmp1 = load float, float addrspace(1)* %in, align 4
   %bc = bitcast float %tmp1 to <2 x i16>
   %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8
@@ -39,7 +39,7 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac
 
 
 ; define void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-;   %tmp1 = load i32 addrspace(1)* %in, align 4
+;   %tmp1 = load i32, i32 addrspace(1)* %in, align 4
 ;   %bc = bitcast i32 %tmp1 to <4 x i8>
 
 ;   %tmp2 = shufflevector <4 x i8> %bc, <4 x i8> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
diff --git a/test/CodeGen/R600/schedule-fs-loop-nested.ll b/test/CodeGen/R600/schedule-fs-loop-nested.ll
index b917ec6413e9..759197ca61f7 100644
--- a/test/CodeGen/R600/schedule-fs-loop-nested.ll
+++ b/test/CodeGen/R600/schedule-fs-loop-nested.ll
@@ -3,7 +3,7 @@
 
 define void @main() {
 main_body:
-  %0 = load <4 x float> addrspace(9)* null
+  %0 = load <4 x float>, <4 x float> addrspace(9)* null
   %1 = extractelement <4 x float> %0, i32 3
   %2 = fptosi float %1 to i32
   %3 = bitcast i32 %2 to float
@@ -20,11 +20,11 @@ main_body:
   %14 = bitcast float %12 to i32
   %15 = add i32 %13, %14
   %16 = bitcast i32 %15 to float
-  %17 = load <4 x float> addrspace(9)* null
+  %17 = load <4 x float>, <4 x float> addrspace(9)* null
   %18 = extractelement <4 x float> %17, i32 0
-  %19 = load <4 x float> addrspace(9)* null
+  %19 = load <4 x float>, <4 x float> addrspace(9)* null
   %20 = extractelement <4 x float> %19, i32 1
-  %21 = load <4 x float> addrspace(9)* null
+  %21 = load <4 x float>, <4 x float> addrspace(9)* null
   %22 = extractelement <4 x float> %21, i32 2
   br label %LOOP
 
diff --git a/test/CodeGen/R600/schedule-fs-loop.ll b/test/CodeGen/R600/schedule-fs-loop.ll
index d6c194b19b27..28cc08abc022 100644
--- a/test/CodeGen/R600/schedule-fs-loop.ll
+++ b/test/CodeGen/R600/schedule-fs-loop.ll
@@ -3,15 +3,15 @@
 
 define void @main() {
 main_body:
-  %0 = load <4 x float> addrspace(9)* null
+  %0 = load <4 x float>, <4 x float> addrspace(9)* null
   %1 = extractelement <4 x float> %0, i32 3
   %2 = fptosi float %1 to i32
   %3 = bitcast i32 %2 to float
-  %4 = load <4 x float> addrspace(9)* null
+  %4 = load <4 x float>, <4 x float> addrspace(9)* null
   %5 = extractelement <4 x float> %4, i32 0
-  %6 = load <4 x float> addrspace(9)* null
+  %6 = load <4 x float>, <4 x float> addrspace(9)* null
   %7 = extractelement <4 x float> %6, i32 1
-  %8 = load <4 x float> addrspace(9)* null
+  %8 = load <4 x float>, <4 x float> addrspace(9)* null
   %9 = extractelement <4 x float> %8, i32 2
   br label %LOOP
 
diff --git a/test/CodeGen/R600/schedule-global-loads.ll b/test/CodeGen/R600/schedule-global-loads.ll
index b6437d25b8cb..3f728fd873b3 100644
--- a/test/CodeGen/R600/schedule-global-loads.ll
+++ b/test/CodeGen/R600/schedule-global-loads.ll
@@ -14,9 +14,9 @@ declare i32 @llvm.r600.read.tidig.x() #1
 ; SI: buffer_store_dword [[REG0]]
 ; SI: buffer_store_dword [[REG1]]
 define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 {
-  %load0 = load i32 addrspace(1)* %ptr, align 4
-  %gep = getelementptr i32 addrspace(1)* %ptr, i32 1
-  %load1 = load i32 addrspace(1)* %gep, align 4
+  %load0 = load i32, i32 addrspace(1)* %ptr, align 4
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 1
+  %load1 = load i32, i32 addrspace(1)* %gep, align 4
   store i32 %load0, i32 addrspace(1)* %out0, align 4
   store i32 %load1, i32 addrspace(1)* %out1, align 4
   ret void
@@ -29,9 +29,9 @@ define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)*
 ; SI: buffer_load_dword
 define void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
 entry:
-  %out1 = getelementptr i32 addrspace(1)* %out, i32 %offset
-  %tmp0 = load i32 addrspace(1)* %out
-  %tmp1 = load i32 addrspace(1)* %out1
+  %out1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset
+  %tmp0 = load i32, i32 addrspace(1)* %out
+  %tmp1 = load i32, i32 addrspace(1)* %out1
   %tmp2 = add i32 %tmp0, %tmp1
   store i32 %tmp2, i32 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/schedule-if-2.ll b/test/CodeGen/R600/schedule-if-2.ll
index 38aad1850f81..549465096833 100644
--- a/test/CodeGen/R600/schedule-if-2.ll
+++ b/test/CodeGen/R600/schedule-if-2.ll
@@ -3,10 +3,10 @@
 
 define void @main() {
 main_body:
-  %0 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %1 = extractelement <4 x float> %0, i32 0
   %2 = fadd float 1.000000e+03, %1
-  %3 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %4 = extractelement <4 x float> %3, i32 0
   %5 = bitcast float %4 to i32
   %6 = icmp eq i32 %5, 0
@@ -47,7 +47,7 @@ IF:                                               ; preds = %main_body
   br label %ENDIF
 
 ELSE:                                             ; preds = %main_body
-  %36 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %37 = extractelement <4 x float> %36, i32 0
   %38 = bitcast float %37 to i32
   %39 = icmp eq i32 %38, 1
@@ -80,7 +80,7 @@ IF23:                                             ; preds = %ELSE
   %.28 = select i1 %54, float 0x36A0000000000000, float 0.000000e+00
   %55 = bitcast float %.28 to i32
   %56 = sitofp i32 %55 to float
-  %57 = load <4 x float> addrspace(8)* null
+  %57 = load <4 x float>, <4 x float> addrspace(8)* null
   %58 = extractelement <4 x float> %57, i32 0
   %59 = fsub float -0.000000e+00, %58
   %60 = fadd float %2, %59
diff --git a/test/CodeGen/R600/schedule-if.ll b/test/CodeGen/R600/schedule-if.ll
index f960c9323940..94c653c8f25b 100644
--- a/test/CodeGen/R600/schedule-if.ll
+++ b/test/CodeGen/R600/schedule-if.ll
@@ -3,7 +3,7 @@
 
 define void @main() {
 main_body:
-  %0 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %1 = extractelement <4 x float> %0, i32 0
   %2 = bitcast float %1 to i32
   %3 = icmp eq i32 %2, 0
@@ -14,7 +14,7 @@ main_body:
   br i1 %7, label %ENDIF, label %ELSE
 
 ELSE:                                             ; preds = %main_body
-  %8 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %9 = extractelement <4 x float> %8, i32 0
   %10 = bitcast float %9 to i32
   %11 = icmp eq i32 %10, 1
@@ -36,7 +36,7 @@ ENDIF:                                            ; preds = %IF13, %ELSE, %main_
   ret void
 
 IF13:                                             ; preds = %ELSE
-  %20 = load <4 x float> addrspace(8)* null
+  %20 = load <4 x float>, <4 x float> addrspace(8)* null
   %21 = extractelement <4 x float> %20, i32 0
   %22 = fsub float -0.000000e+00, %21
   %23 = fadd float 1.000000e+03, %22
diff --git a/test/CodeGen/R600/schedule-kernel-arg-loads.ll b/test/CodeGen/R600/schedule-kernel-arg-loads.ll
index 01d897ff18cb..6b3e0814c380 100644
--- a/test/CodeGen/R600/schedule-kernel-arg-loads.ll
+++ b/test/CodeGen/R600/schedule-kernel-arg-loads.ll
@@ -6,6 +6,13 @@
 ; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
 ; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe
+; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
+; VI-NEXT: s_nop 0
+; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-NEXT: s_nop 0
+; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
+; VI-NEXT: s_nop 0
+; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
 define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
   store i32 %x, i32 addrspace(1)* %out0, align 4
   store i32 %y, i32 addrspace(1)* %out1, align 4
diff --git a/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll b/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll
index 76b655d712d0..3863afda5dd3 100644
--- a/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll
+++ b/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll
@@ -39,63 +39,63 @@ ENDIF:                                            ; preds = %main_body, %Flow2
   %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %101, %Flow2 ]
   %15 = extractelement <4 x float> %reg1, i32 1
   %16 = extractelement <4 x float> %reg1, i32 3
-  %17 = load <4 x float> addrspace(9)* null
+  %17 = load <4 x float>, <4 x float> addrspace(9)* null
   %18 = extractelement <4 x float> %17, i32 0
   %19 = fmul float %18, %0
-  %20 = load <4 x float> addrspace(9)* null
+  %20 = load <4 x float>, <4 x float> addrspace(9)* null
   %21 = extractelement <4 x float> %20, i32 1
   %22 = fmul float %21, %0
-  %23 = load <4 x float> addrspace(9)* null
+  %23 = load <4 x float>, <4 x float> addrspace(9)* null
   %24 = extractelement <4 x float> %23, i32 2
   %25 = fmul float %24, %0
-  %26 = load <4 x float> addrspace(9)* null
+  %26 = load <4 x float>, <4 x float> addrspace(9)* null
   %27 = extractelement <4 x float> %26, i32 3
   %28 = fmul float %27, %0
-  %29 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %29 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
   %30 = extractelement <4 x float> %29, i32 0
   %31 = fmul float %30, %15
   %32 = fadd float %31, %19
-  %33 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %33 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
   %34 = extractelement <4 x float> %33, i32 1
   %35 = fmul float %34, %15
   %36 = fadd float %35, %22
-  %37 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %37 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
   %38 = extractelement <4 x float> %37, i32 2
   %39 = fmul float %38, %15
   %40 = fadd float %39, %25
-  %41 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %41 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
   %42 = extractelement <4 x float> %41, i32 3
   %43 = fmul float %42, %15
   %44 = fadd float %43, %28
-  %45 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %45 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
   %46 = extractelement <4 x float> %45, i32 0
   %47 = fmul float %46, %1
   %48 = fadd float %47, %32
-  %49 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %49 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
   %50 = extractelement <4 x float> %49, i32 1
   %51 = fmul float %50, %1
   %52 = fadd float %51, %36
-  %53 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %53 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
   %54 = extractelement <4 x float> %53, i32 2
   %55 = fmul float %54, %1
   %56 = fadd float %55, %40
-  %57 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %57 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
   %58 = extractelement <4 x float> %57, i32 3
   %59 = fmul float %58, %1
   %60 = fadd float %59, %44
-  %61 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %61 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
   %62 = extractelement <4 x float> %61, i32 0
   %63 = fmul float %62, %16
   %64 = fadd float %63, %48
-  %65 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %65 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
   %66 = extractelement <4 x float> %65, i32 1
   %67 = fmul float %66, %16
   %68 = fadd float %67, %52
-  %69 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %69 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
   %70 = extractelement <4 x float> %69, i32 2
   %71 = fmul float %70, %16
   %72 = fadd float %71, %56
-  %73 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %73 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
   %74 = extractelement <4 x float> %73, i32 3
   %75 = fmul float %74, %16
   %76 = fadd float %75, %60
diff --git a/test/CodeGen/R600/schedule-vs-if-nested-loop.ll b/test/CodeGen/R600/schedule-vs-if-nested-loop.ll
index 33b20d36737b..8d980dbf8995 100644
--- a/test/CodeGen/R600/schedule-vs-if-nested-loop.ll
+++ b/test/CodeGen/R600/schedule-vs-if-nested-loop.ll
@@ -21,63 +21,63 @@ ENDIF:                                            ; preds = %ENDIF16, %LOOP, %ma
   %temp1.0 = phi float [ 1.000000e+00, %main_body ], [ %temp1.1, %LOOP ], [ %temp1.1, %ENDIF16 ]
   %temp2.0 = phi float [ 0.000000e+00, %main_body ], [ %temp2.1, %LOOP ], [ %temp2.1, %ENDIF16 ]
   %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %temp3.1, %LOOP ], [ %temp3.1, %ENDIF16 ]
-  %11 = load <4 x float> addrspace(9)* null
+  %11 = load <4 x float>, <4 x float> addrspace(9)* null
   %12 = extractelement <4 x float> %11, i32 0
   %13 = fmul float %12, %0
-  %14 = load <4 x float> addrspace(9)* null
+  %14 = load <4 x float>, <4 x float> addrspace(9)* null
   %15 = extractelement <4 x float> %14, i32 1
   %16 = fmul float %15, %0
-  %17 = load <4 x float> addrspace(9)* null
+  %17 = load <4 x float>, <4 x float> addrspace(9)* null
   %18 = extractelement <4 x float> %17, i32 2
   %19 = fmul float %18, %0
-  %20 = load <4 x float> addrspace(9)* null
+  %20 = load <4 x float>, <4 x float> addrspace(9)* null
   %21 = extractelement <4 x float> %20, i32 3
   %22 = fmul float %21, %0
-  %23 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %23 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
   %24 = extractelement <4 x float> %23, i32 0
   %25 = fmul float %24, %1
   %26 = fadd float %25, %13
-  %27 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %27 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
   %28 = extractelement <4 x float> %27, i32 1
   %29 = fmul float %28, %1
   %30 = fadd float %29, %16
-  %31 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %31 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
   %32 = extractelement <4 x float> %31, i32 2
   %33 = fmul float %32, %1
   %34 = fadd float %33, %19
-  %35 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %35 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
   %36 = extractelement <4 x float> %35, i32 3
   %37 = fmul float %36, %1
   %38 = fadd float %37, %22
-  %39 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %39 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
   %40 = extractelement <4 x float> %39, i32 0
   %41 = fmul float %40, %2
   %42 = fadd float %41, %26
-  %43 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %43 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
   %44 = extractelement <4 x float> %43, i32 1
   %45 = fmul float %44, %2
   %46 = fadd float %45, %30
-  %47 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %47 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
   %48 = extractelement <4 x float> %47, i32 2
   %49 = fmul float %48, %2
   %50 = fadd float %49, %34
-  %51 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %51 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
   %52 = extractelement <4 x float> %51, i32 3
   %53 = fmul float %52, %2
   %54 = fadd float %53, %38
-  %55 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %55 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
   %56 = extractelement <4 x float> %55, i32 0
   %57 = fmul float %56, %3
   %58 = fadd float %57, %42
-  %59 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %59 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
   %60 = extractelement <4 x float> %59, i32 1
   %61 = fmul float %60, %3
   %62 = fadd float %61, %46
-  %63 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %63 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
   %64 = extractelement <4 x float> %63, i32 2
   %65 = fmul float %64, %3
   %66 = fadd float %65, %50
-  %67 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %67 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
   %68 = extractelement <4 x float> %67, i32 3
   %69 = fmul float %68, %3
   %70 = fadd float %69, %54
diff --git a/test/CodeGen/R600/scratch-buffer.ll b/test/CodeGen/R600/scratch-buffer.ll
index 740328a495da..56088718ada8 100644
--- a/test/CodeGen/R600/scratch-buffer.ll
+++ b/test/CodeGen/R600/scratch-buffer.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s
 
 ; When a frame index offset is more than 12-bits, make sure we don't store
 ; it in mubuf's offset field.
@@ -18,23 +19,23 @@ entry:
   %scratch0 = alloca [8192 x i32]
   %scratch1 = alloca [8192 x i32]
 
-  %scratchptr0 = getelementptr [8192 x i32]* %scratch0, i32 0, i32 0
+  %scratchptr0 = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 0
   store i32 1, i32* %scratchptr0
 
-  %scratchptr1 = getelementptr [8192 x i32]* %scratch1, i32 0, i32 0
+  %scratchptr1 = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 0
   store i32 2, i32* %scratchptr1
 
   %cmp = icmp eq i32 %cond, 0
   br i1 %cmp, label %if, label %else
 
 if:
-  %if_ptr = getelementptr [8192 x i32]* %scratch0, i32 0, i32 %if_offset
-  %if_value = load i32* %if_ptr
+  %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset
+  %if_value = load i32, i32* %if_ptr
   br label %done
 
 else:
-  %else_ptr = getelementptr [8192 x i32]* %scratch1, i32 0, i32 %else_offset
-  %else_value = load i32* %else_ptr
+  %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset
+  %else_value = load i32, i32* %else_ptr
   br label %done
 
 done:
@@ -56,26 +57,26 @@ entry:
   %scratch0 = alloca [8192 x i32]
   %scratch1 = alloca [8192 x i32]
 
-  %offset0 = load i32 addrspace(1)* %offsets
-  %scratchptr0 = getelementptr [8192 x i32]* %scratch0, i32 0, i32 %offset0
+  %offset0 = load i32, i32 addrspace(1)* %offsets
+  %scratchptr0 = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %offset0
   store i32 %offset0, i32* %scratchptr0
 
-  %offsetptr1 = getelementptr i32 addrspace(1)* %offsets, i32 1
-  %offset1 = load i32 addrspace(1)* %offsetptr1
-  %scratchptr1 = getelementptr [8192 x i32]* %scratch1, i32 0, i32 %offset1
+  %offsetptr1 = getelementptr i32, i32 addrspace(1)* %offsets, i32 1
+  %offset1 = load i32, i32 addrspace(1)* %offsetptr1
+  %scratchptr1 = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %offset1
   store i32 %offset1, i32* %scratchptr1
 
   %cmp = icmp eq i32 %cond, 0
   br i1 %cmp, label %if, label %else
 
 if:
-  %if_ptr = getelementptr [8192 x i32]* %scratch0, i32 0, i32 %if_offset
-  %if_value = load i32* %if_ptr
+  %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset
+  %if_value = load i32, i32* %if_ptr
   br label %done
 
 else:
-  %else_ptr = getelementptr [8192 x i32]* %scratch1, i32 0, i32 %else_offset
-  %else_value = load i32* %else_ptr
+  %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset
+  %else_value = load i32, i32* %else_ptr
   br label %done
 
 done:
diff --git a/test/CodeGen/R600/sdiv.ll b/test/CodeGen/R600/sdiv.ll
index 07bb41768ee3..de645353a401 100644
--- a/test/CodeGen/R600/sdiv.ll
+++ b/test/CodeGen/R600/sdiv.ll
@@ -14,9 +14,9 @@
 ; FUNC-LABEL: {{^}}sdiv_i32:
 ; EG: CF_END
 define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %num = load i32 addrspace(1) * %in
-  %den = load i32 addrspace(1) * %den_ptr
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in
+  %den = load i32, i32 addrspace(1) * %den_ptr
   %result = sdiv i32 %num, %den
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -24,7 +24,7 @@ define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 
 ; FUNC-LABEL: {{^}}sdiv_i32_4:
 define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %num = load i32 addrspace(1) * %in
+  %num = load i32, i32 addrspace(1) * %in
   %result = sdiv i32 %num, 4
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -44,39 +44,39 @@ define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; SI: buffer_store_dword
 ; SI: s_endpgm
 define void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %num = load i32 addrspace(1) * %in
+  %num = load i32, i32 addrspace(1) * %in
   %result = sdiv i32 %num, 3435
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
 
 define void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %den_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
-  %num = load <2 x i32> addrspace(1) * %in
-  %den = load <2 x i32> addrspace(1) * %den_ptr
+  %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+  %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
+  %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr
   %result = sdiv <2 x i32> %num, %den
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
 }
 
 define void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %num = load <2 x i32> addrspace(1) * %in
+  %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %result = sdiv <2 x i32> %num, <i32 4, i32 4>
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
 }
 
 define void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %den_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
-  %num = load <4 x i32> addrspace(1) * %in
-  %den = load <4 x i32> addrspace(1) * %den_ptr
+  %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+  %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
+  %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr
   %result = sdiv <4 x i32> %num, %den
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
 }
 
 define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %num = load <4 x i32> addrspace(1) * %in
+  %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/sdivrem24.ll b/test/CodeGen/R600/sdivrem24.ll
index e8c5c252bd72..ad5df39f5505 100644
--- a/test/CodeGen/R600/sdivrem24.ll
+++ b/test/CodeGen/R600/sdivrem24.ll
@@ -13,9 +13,9 @@
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
 define void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
-  %den_ptr = getelementptr i8 addrspace(1)* %in, i8 1
-  %num = load i8 addrspace(1) * %in
-  %den = load i8 addrspace(1) * %den_ptr
+  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
+  %num = load i8, i8 addrspace(1) * %in
+  %den = load i8, i8 addrspace(1) * %den_ptr
   %result = sdiv i8 %num, %den
   store i8 %result, i8 addrspace(1)* %out
   ret void
@@ -32,9 +32,9 @@ define void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
 define void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
-  %den_ptr = getelementptr i16 addrspace(1)* %in, i16 1
-  %num = load i16 addrspace(1) * %in, align 2
-  %den = load i16 addrspace(1) * %den_ptr, align 2
+  %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
+  %num = load i16, i16 addrspace(1) * %in, align 2
+  %den = load i16, i16 addrspace(1) * %den_ptr, align 2
   %result = sdiv i16 %num, %den
   store i16 %result, i16 addrspace(1)* %out, align 2
   ret void
@@ -51,9 +51,9 @@ define void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
 define void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %num = load i32 addrspace(1) * %in, align 4
-  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 8
   %den.i24.0 = shl i32 %den, 8
   %num.i24 = ashr i32 %num.i24.0, 8
@@ -70,9 +70,9 @@ define void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
 define void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %num = load i32 addrspace(1) * %in, align 4
-  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 7
   %den.i24.0 = shl i32 %den, 7
   %num.i24 = ashr i32 %num.i24.0, 7
@@ -89,9 +89,9 @@ define void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
 define void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %num = load i32 addrspace(1) * %in, align 4
-  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 8
   %den.i24.0 = shl i32 %den, 7
   %num.i24 = ashr i32 %num.i24.0, 8
@@ -108,9 +108,9 @@ define void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
 define void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %num = load i32 addrspace(1) * %in, align 4
-  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 7
   %den.i24.0 = shl i32 %den, 8
   %num.i24 = ashr i32 %num.i24.0, 7
@@ -131,9 +131,9 @@ define void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
 define void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
-  %den_ptr = getelementptr i8 addrspace(1)* %in, i8 1
-  %num = load i8 addrspace(1) * %in
-  %den = load i8 addrspace(1) * %den_ptr
+  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
+  %num = load i8, i8 addrspace(1) * %in
+  %den = load i8, i8 addrspace(1) * %den_ptr
   %result = srem i8 %num, %den
   store i8 %result, i8 addrspace(1)* %out
   ret void
@@ -150,9 +150,9 @@ define void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
 define void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
-  %den_ptr = getelementptr i16 addrspace(1)* %in, i16 1
-  %num = load i16 addrspace(1) * %in, align 2
-  %den = load i16 addrspace(1) * %den_ptr, align 2
+  %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
+  %num = load i16, i16 addrspace(1) * %in, align 2
+  %den = load i16, i16 addrspace(1) * %den_ptr, align 2
   %result = srem i16 %num, %den
   store i16 %result, i16 addrspace(1)* %out, align 2
   ret void
@@ -169,9 +169,9 @@ define void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
 define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %num = load i32 addrspace(1) * %in, align 4
-  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 8
   %den.i24.0 = shl i32 %den, 8
   %num.i24 = ashr i32 %num.i24.0, 8
@@ -188,9 +188,9 @@ define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
 define void @srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %num = load i32 addrspace(1) * %in, align 4
-  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 7
   %den.i24.0 = shl i32 %den, 7
   %num.i24 = ashr i32 %num.i24.0, 7
@@ -207,9 +207,9 @@ define void @srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
 define void @test_no_srem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %num = load i32 addrspace(1) * %in, align 4
-  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 8
   %den.i24.0 = shl i32 %den, 7
   %num.i24 = ashr i32 %num.i24.0, 8
@@ -226,9 +226,9 @@ define void @test_no_srem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
 define void @test_no_srem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %num = load i32 addrspace(1) * %in, align 4
-  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 7
   %den.i24.0 = shl i32 %den, 8
   %num.i24 = ashr i32 %num.i24.0, 7
diff --git a/test/CodeGen/R600/sdivrem64.ll b/test/CodeGen/R600/sdivrem64.ll
new file mode 100644
index 000000000000..a9b2b7f9df55
--- /dev/null
+++ b/test/CodeGen/R600/sdivrem64.ll
@@ -0,0 +1,225 @@
+;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
+;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s
+;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
+
+;FUNC-LABEL: {{^}}test_sdiv:
+;EG: RECIP_UINT
+;EG: LSHL {{.*}}, 1,
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %result = sdiv i64 %x, %y
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+;FUNC-LABEL: {{^}}test_srem:
+;EG: RECIP_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: AND_INT {{.*}}, 1,
+
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %result = urem i64 %x, %y
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+;FUNC-LABEL: {{^}}test_sdiv3264:
+;EG: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;GCN-NOT: s_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %1 = ashr i64 %x, 33
+  %2 = ashr i64 %y, 33
+  %result = sdiv i64 %1, %2
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+;FUNC-LABEL: {{^}}test_srem3264:
+;EG: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;GCN-NOT: s_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %1 = ashr i64 %x, 33
+  %2 = ashr i64 %y, 33
+  %result = srem i64 %1, %2
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+;FUNC-LABEL: {{^}}test_sdiv2464:
+;EG: INT_TO_FLT
+;EG: INT_TO_FLT
+;EG: FLT_TO_INT
+;EG-NOT: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;GCN-NOT: s_bfe_u32
+;GCN: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %1 = ashr i64 %x, 40
+  %2 = ashr i64 %y, 40
+  %result = sdiv i64 %1, %2
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+;FUNC-LABEL: {{^}}test_srem2464:
+;EG: INT_TO_FLT
+;EG: INT_TO_FLT
+;EG: FLT_TO_INT
+;EG-NOT: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;GCN-NOT: s_bfe_u32
+;GCN: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %1 = ashr i64 %x, 40
+  %2 = ashr i64 %y, 40
+  %result = srem i64 %1, %2
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/select64.ll b/test/CodeGen/R600/select64.ll
index 3fd648139fe2..5cebb30dc72e 100644
--- a/test/CodeGen/R600/select64.ll
+++ b/test/CodeGen/R600/select64.ll
@@ -42,10 +42,27 @@ define void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %
 ; CHECK-NOT: v_cndmask_b32
 define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %cmp = icmp ugt i32 %cond, 5
-  %a = load i64 addrspace(1)* %aptr, align 8
-  %b = load i64 addrspace(1)* %bptr, align 8
+  %a = load i64, i64 addrspace(1)* %aptr, align 8
+  %b = load i64, i64 addrspace(1)* %bptr, align 8
   %sel = select i1 %cmp, i64 %a, i64 %b
   %trunc = trunc i64 %sel to i32
   store i32 %trunc, i32 addrspace(1)* %out, align 4
   ret void
 }
+
+; CHECK-LABEL: {{^}}v_select_i64_split_imm:
+; CHECK: s_mov_b32 [[SHI:s[0-9]+]], 63
+; CHECK: s_mov_b32 [[SLO:s[0-9]+]], 0
+; CHECK-DAG: v_mov_b32_e32 [[VHI:v[0-9]+]], [[SHI]]
+; CHECK-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], [[SLO]]
+; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VLO]], {{v[0-9]+}}
+; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VHI]], {{v[0-9]+}}
+; CHECK: s_endpgm
+define void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+  %cmp = icmp ugt i32 %cond, 5
+  %a = load i64, i64 addrspace(1)* %aptr, align 8
+  %b = load i64, i64 addrspace(1)* %bptr, align 8
+  %sel = select i1 %cmp, i64 %a, i64 270582939648 ; 63 << 32
+  store i64 %sel, i64 addrspace(1)* %out, align 8
+  ret void
+}
diff --git a/test/CodeGen/R600/selectcc-cnd.ll b/test/CodeGen/R600/selectcc-cnd.ll
index 0bfca6937488..94d0ace75697 100644
--- a/test/CodeGen/R600/selectcc-cnd.ll
+++ b/test/CodeGen/R600/selectcc-cnd.ll
@@ -4,7 +4,7 @@
 ;CHECK: CNDE {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1.0, literal.x,
 ;CHECK: 1073741824
 define void @test(float addrspace(1)* %out, float addrspace(1)* %in) {
-  %1 = load float addrspace(1)* %in
+  %1 = load float, float addrspace(1)* %in
   %2 = fcmp oeq float %1, 0.0
   %3 = select i1 %2, float 1.0, float 2.0
   store float %3, float addrspace(1)* %out
diff --git a/test/CodeGen/R600/selectcc-cnde-int.ll b/test/CodeGen/R600/selectcc-cnde-int.ll
index d568888f7cb2..58a4ee7d62b2 100644
--- a/test/CodeGen/R600/selectcc-cnde-int.ll
+++ b/test/CodeGen/R600/selectcc-cnde-int.ll
@@ -4,7 +4,7 @@
 ;CHECK: CNDE_INT {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, literal.x,
 ;CHECK-NEXT: 2
 define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %1 = load i32 addrspace(1)* %in
+  %1 = load i32, i32 addrspace(1)* %in
   %2 = icmp eq i32 %1, 0
   %3 = select i1 %2, i32 1, i32 2
   store i32 %3, i32 addrspace(1)* %out
diff --git a/test/CodeGen/R600/selectcc-icmp-select-float.ll b/test/CodeGen/R600/selectcc-icmp-select-float.ll
index 6743800490b3..e870ee891e66 100644
--- a/test/CodeGen/R600/selectcc-icmp-select-float.ll
+++ b/test/CodeGen/R600/selectcc-icmp-select-float.ll
@@ -8,7 +8,7 @@
 
 define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
-  %0 = load i32 addrspace(1)* %in
+  %0 = load i32, i32 addrspace(1)* %in
   %1 = icmp sge i32 %0, 0
   %2 = select i1 %1, float 1.0, float 0.0
   store float %2, float addrspace(1)* %out
diff --git a/test/CodeGen/R600/selectcc-opt.ll b/test/CodeGen/R600/selectcc-opt.ll
index 7780371329ce..65be4a626a18 100644
--- a/test/CodeGen/R600/selectcc-opt.ll
+++ b/test/CodeGen/R600/selectcc-opt.ll
@@ -19,7 +19,7 @@ entry:
   br i1 %6, label %IF, label %ENDIF
 
 IF:
-  %7 = getelementptr i32 addrspace(1)* %out, i32 1
+  %7 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   store i32 0, i32 addrspace(1)* %7
   br label %ENDIF
 
@@ -47,7 +47,7 @@ entry:
   br i1 %6, label %ENDIF, label %IF
 
 IF:
-  %7 = getelementptr i32 addrspace(1)* %out, i32 1
+  %7 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   store i32 0, i32 addrspace(1)* %7
   br label %ENDIF
 
diff --git a/test/CodeGen/R600/setcc-opt.ll b/test/CodeGen/R600/setcc-opt.ll
index a44c89f72cf5..4e6a10d6b78d 100644
--- a/test/CodeGen/R600/setcc-opt.ll
+++ b/test/CodeGen/R600/setcc-opt.ll
@@ -1,12 +1,13 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}sext_bool_icmp_eq_0:
-; SI-NOT: v_cmp
-; SI: v_cmp_ne_i32_e32 vcc,
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
-; SI-NEXT:buffer_store_byte [[RESULT]]
-; SI-NEXT: s_endpgm
+; GCN-NOT: v_cmp
+; GCN: v_cmp_ne_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT:buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
 
 ; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W
 ; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1
@@ -19,11 +20,11 @@ define void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
 }
 
 ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_0:
-; SI-NOT: v_cmp
-; SI: v_cmp_ne_i32_e32 vcc,
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
-; SI-NEXT: buffer_store_byte [[RESULT]]
-; SI-NEXT: s_endpgm
+; GCN-NOT: v_cmp
+; GCN: v_cmp_ne_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
 
 ; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W
 ; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1
@@ -37,12 +38,12 @@ define void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
 
 ; This really folds away to false
 ; FUNC-LABEL: {{^}}sext_bool_icmp_eq_1:
-; SI: v_cmp_eq_i32_e32 vcc,
-; SI-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc
-; SI-NEXT: v_cmp_eq_i32_e64 {{s\[[0-9]+:[0-9]+\]}}, [[TMP]], 1{{$}}
-; SI-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1,
-; SI-NEXT: buffer_store_byte [[TMP]]
-; SI-NEXT: s_endpgm
+; GCN: v_cmp_eq_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc
+; GCN-NEXT: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
+; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1,
+; GCN-NEXT: buffer_store_byte [[TMP]]
+; GCN-NEXT: s_endpgm
 define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = sext i1 %icmp0 to i32
@@ -53,12 +54,12 @@ define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
 
 ; This really folds away to true
 ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_1:
-; SI: v_cmp_ne_i32_e32 vcc,
-; SI-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc
-; SI-NEXT: v_cmp_ne_i32_e64 {{s\[[0-9]+:[0-9]+\]}}, [[TMP]], 1{{$}}
-; SI-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1,
-; SI-NEXT: buffer_store_byte [[TMP]]
-; SI-NEXT: s_endpgm
+; GCN: v_cmp_ne_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc
+; GCN-NEXT: v_cmp_ne_i32_e32 vcc, 1, [[TMP]]{{$}}
+; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1,
+; GCN-NEXT: buffer_store_byte [[TMP]]
+; GCN-NEXT: s_endpgm
 define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = sext i1 %icmp0 to i32
@@ -68,11 +69,11 @@ define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
 }
 
 ; FUNC-LABEL: {{^}}zext_bool_icmp_eq_0:
-; SI-NOT: v_cmp
-; SI: v_cmp_ne_i32_e32 vcc,
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
-; SI-NEXT: buffer_store_byte [[RESULT]]
-; SI-NEXT: s_endpgm
+; GCN-NOT: v_cmp
+; GCN: v_cmp_ne_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
 define void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = zext i1 %icmp0 to i32
@@ -82,11 +83,11 @@ define void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
 }
 
 ; FUNC-LABEL: {{^}}zext_bool_icmp_ne_0:
-; SI-NOT: v_cmp
-; SI: v_cmp_ne_i32_e32 vcc,
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
-; SI-NEXT: buffer_store_byte [[RESULT]]
-; SI-NEXT: s_endpgm
+; GCN-NOT: v_cmp
+; GCN: v_cmp_ne_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
 define void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = zext i1 %icmp0 to i32
@@ -96,11 +97,11 @@ define void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
 }
 
 ; FUNC-LABEL: {{^}}zext_bool_icmp_eq_1:
-; SI-NOT: v_cmp
-; SI: v_cmp_eq_i32_e32 vcc,
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
-; SI-NEXT: buffer_store_byte [[RESULT]]
-; SI-NEXT: s_endpgm
+; GCN-NOT: v_cmp
+; GCN: v_cmp_eq_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
 define void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = zext i1 %icmp0 to i32
@@ -110,10 +111,10 @@ define void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
 }
 
 ; FUNC-LABEL: {{^}}zext_bool_icmp_ne_1:
-; SI-NOT: v_cmp
-; SI: v_cmp_eq_i32_e32 vcc,
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; GCN-NOT: v_cmp
+; GCN: v_cmp_eq_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
 define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = zext i1 %icmp0 to i32
@@ -125,11 +126,13 @@ define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
 ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_k:
 ; SI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
-; SI: v_cmp_ne_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[VB]], 2{{$}}
-; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]]
-; SI: buffer_store_byte
-; SI: s_endpgm
+; VI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
+; GCN: v_cmp_ne_i32_e32 vcc, 2, [[VB]]{{$}}
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: buffer_store_byte
+; GCN: s_endpgm
 define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = sext i1 %icmp0 to i32
@@ -139,12 +142,12 @@ define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
 }
 
 ; FUNC-LABEL: {{^}}cmp_zext_k_i8max:
-; SI: buffer_load_ubyte [[B:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
-; SI: v_mov_b32_e32 [[K255:v[0-9]+]], 0xff{{$}}
-; SI: v_cmp_ne_i32_e32 vcc, [[B]], [[K255]]
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
-; SI-NEXT: buffer_store_byte [[RESULT]]
-; SI: s_endpgm
+; GCN: buffer_load_ubyte [[B:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
+; GCN: v_mov_b32_e32 [[K255:v[0-9]+]], 0xff{{$}}
+; GCN: v_cmp_ne_i32_e32 vcc, [[K255]], [[B]]
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: s_endpgm
 define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {
   %b.ext = zext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, 255
@@ -153,13 +156,13 @@ define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}cmp_sext_k_neg1:
-; SI: buffer_load_sbyte [[B:v[0-9]+]]
-; SI: v_cmp_ne_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[B]], -1{{$}}
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
-; SI: s_endpgm
+; GCN: buffer_load_sbyte [[B:v[0-9]+]]
+; GCN: v_cmp_ne_i32_e32 vcc, -1, [[B]]{{$}}
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: s_endpgm
 define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind {
-  %b = load i8 addrspace(1)* %b.ptr
+  %b = load i8, i8 addrspace(1)* %b.ptr
   %b.ext = sext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, -1
   store i1 %icmp0, i1 addrspace(1)* %out
@@ -167,11 +170,11 @@ define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nou
 }
 
 ; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_sext_arg:
-; SI: s_load_dword [[B:s[0-9]+]]
-; SI: v_cmp_ne_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[B]], -1{{$}}
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
-; SI: s_endpgm
+; GCN: s_load_dword [[B:s[0-9]+]]
+; GCN: v_cmp_ne_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -1, [[B]]
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]]
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: s_endpgm
 define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind {
   %b.ext = sext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, -1
@@ -184,12 +187,12 @@ define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) n
 ; Should do a buffer_load_sbyte and compare with -1
 
 ; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_arg:
-; SI-DAG: buffer_load_ubyte [[B:v[0-9]+]]
-; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xff{{$}}
-; SI: v_cmp_ne_i32_e32 vcc, [[B]], [[K]]{{$}}
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
-; SI-NEXT: buffer_store_byte [[RESULT]]
-; SI: s_endpgm
+; GCN-DAG: buffer_load_ubyte [[B:v[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xff{{$}}
+; GCN: v_cmp_ne_i32_e32 vcc, [[K]], [[B]]{{$}}
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: s_endpgm
 define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind {
   %b.ext = sext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, -1
@@ -198,9 +201,9 @@ define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}cmp_zext_k_neg1:
-; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
-; SI-NEXT: buffer_store_byte [[RESULT]]
-; SI: s_endpgm
+; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
+; GCN: buffer_store_byte [[RESULT]]
+; GCN: s_endpgm
 define void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind {
   %b.ext = zext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, -1
@@ -209,9 +212,9 @@ define void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}zext_bool_icmp_ne_k:
-; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
-; SI-NEXT: buffer_store_byte [[RESULT]]
-; SI-NEXT: s_endpgm
+; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
+; GCN: buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
 define void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = zext i1 %icmp0 to i32
@@ -221,9 +224,9 @@ define void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
 }
 
 ; FUNC-LABEL: {{^}}zext_bool_icmp_eq_k:
-; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
-; SI-NEXT: buffer_store_byte [[RESULT]]
-; SI-NEXT: s_endpgm
+; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
 define void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = zext i1 %icmp0 to i32
diff --git a/test/CodeGen/R600/setcc.ll b/test/CodeGen/R600/setcc.ll
index f9c7e4f36128..f33a82df5ffb 100644
--- a/test/CodeGen/R600/setcc.ll
+++ b/test/CodeGen/R600/setcc.ll
@@ -21,9 +21,9 @@ define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %
 ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
-  %a = load <4 x i32> addrspace(1) * %in
-  %b = load <4 x i32> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
+  %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
   %result = icmp eq <4 x i32> %a, %b
   %sext = sext <4 x i1> %result to <4 x i32>
   store <4 x i32> %sext, <4 x i32> addrspace(1)* %out
@@ -344,11 +344,11 @@ entry:
 ; SI: s_endpgm
 define void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.a = getelementptr <3 x i32> addrspace(1)* %ptra, i32 %tid
-  %gep.b = getelementptr <3 x i32> addrspace(1)* %ptrb, i32 %tid
-  %gep.out = getelementptr <3 x i32> addrspace(1)* %out, i32 %tid
-  %a = load <3 x i32> addrspace(1)* %gep.a
-  %b = load <3 x i32> addrspace(1)* %gep.b
+  %gep.a = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptra, i32 %tid
+  %gep.b = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptrb, i32 %tid
+  %gep.out = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid
+  %a = load <3 x i32>, <3 x i32> addrspace(1)* %gep.a
+  %b = load <3 x i32>, <3 x i32> addrspace(1)* %gep.b
   %cmp = icmp eq <3 x i32> %a, %b
   %ext = sext <3 x i1> %cmp to <3 x i32>
   store <3 x i32> %ext, <3 x i32> addrspace(1)* %gep.out
@@ -365,11 +365,11 @@ define void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptr
 ; SI: s_endpgm
 define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.a = getelementptr <3 x i8> addrspace(1)* %ptra, i32 %tid
-  %gep.b = getelementptr <3 x i8> addrspace(1)* %ptrb, i32 %tid
-  %gep.out = getelementptr <3 x i8> addrspace(1)* %out, i32 %tid
-  %a = load <3 x i8> addrspace(1)* %gep.a
-  %b = load <3 x i8> addrspace(1)* %gep.b
+  %gep.a = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptra, i32 %tid
+  %gep.b = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptrb, i32 %tid
+  %gep.out = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %out, i32 %tid
+  %a = load <3 x i8>, <3 x i8> addrspace(1)* %gep.a
+  %b = load <3 x i8>, <3 x i8> addrspace(1)* %gep.b
   %cmp = icmp eq <3 x i8> %a, %b
   %ext = sext <3 x i1> %cmp to <3 x i8>
   store <3 x i8> %ext, <3 x i8> addrspace(1)* %gep.out
diff --git a/test/CodeGen/R600/sext-in-reg.ll b/test/CodeGen/R600/sext-in-reg.ll
index 3260179921f9..d9ad4935968d 100644
--- a/test/CodeGen/R600/sext-in-reg.ll
+++ b/test/CodeGen/R600/sext-in-reg.ll
@@ -187,11 +187,11 @@ define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) noun
 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
 define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
-  %a.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
-  %b.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
-  %out.gep = getelementptr i64 addrspace(1)* %out, i32 %tid
-  %a = load i64 addrspace(1)* %a.gep, align 8
-  %b = load i64 addrspace(1)* %b.gep, align 8
+  %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+  %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+  %a = load i64, i64 addrspace(1)* %a.gep, align 8
+  %b = load i64, i64 addrspace(1)* %b.gep, align 8
 
   %c = shl i64 %a, %b
   %shl = shl i64 %c, 63
@@ -208,11 +208,11 @@ define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
 define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
-  %a.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
-  %b.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
-  %out.gep = getelementptr i64 addrspace(1)* %out, i32 %tid
-  %a = load i64 addrspace(1)* %a.gep, align 8
-  %b = load i64 addrspace(1)* %b.gep, align 8
+  %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+  %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+  %a = load i64, i64 addrspace(1)* %a.gep, align 8
+  %b = load i64, i64 addrspace(1)* %b.gep, align 8
 
   %c = shl i64 %a, %b
   %shl = shl i64 %c, 56
@@ -229,11 +229,11 @@ define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
 define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
-  %a.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
-  %b.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
-  %out.gep = getelementptr i64 addrspace(1)* %out, i32 %tid
-  %a = load i64 addrspace(1)* %a.gep, align 8
-  %b = load i64 addrspace(1)* %b.gep, align 8
+  %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+  %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+  %a = load i64, i64 addrspace(1)* %a.gep, align 8
+  %b = load i64, i64 addrspace(1)* %b.gep, align 8
 
   %c = shl i64 %a, %b
   %shl = shl i64 %c, 48
@@ -249,11 +249,11 @@ define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[SHR]]{{\]}}
 define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
-  %a.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
-  %b.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
-  %out.gep = getelementptr i64 addrspace(1)* %out, i32 %tid
-  %a = load i64 addrspace(1)* %a.gep, align 8
-  %b = load i64 addrspace(1)* %b.gep, align 8
+  %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+  %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+  %a = load i64, i64 addrspace(1)* %a.gep, align 8
+  %b = load i64, i64 addrspace(1)* %b.gep, align 8
 
   %c = shl i64 %a, %b
   %shl = shl i64 %c, 32
@@ -263,9 +263,9 @@ define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
 }
 
 ; FUNC-LABEL: {{^}}sext_in_reg_i1_in_i32_other_amount:
-; SI-NOT: {{[^@]}}bfe
-; SI: s_lshl_b32 [[REG:s[0-9]+]], {{s[0-9]+}}, 6
-; SI: s_ashr_i32 {{s[0-9]+}}, [[REG]], 7
+; SI-NOT: s_lshl
+; SI-NOT: s_ashr
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
 ; EG-NOT: BFE
@@ -282,10 +282,10 @@ define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a,
 }
 
 ; FUNC-LABEL: {{^}}sext_in_reg_v2i1_in_v2i32_other_amount:
-; SI-DAG: s_lshl_b32 [[REG0:s[0-9]+]], {{s[0-9]}}, 6
-; SI-DAG: s_ashr_i32 {{s[0-9]+}}, [[REG0]], 7
-; SI-DAG: s_lshl_b32 [[REG1:s[0-9]+]], {{s[0-9]}}, 6
-; SI-DAG: s_ashr_i32 {{s[0-9]+}}, [[REG1]], 7
+; SI-NOT: s_lshl
+; SI-NOT: s_ashr
+; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001
+; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001
 ; SI: s_endpgm
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
@@ -428,8 +428,8 @@ define void @testcase_3(i8 addrspace(1)* %out, i8 %a) nounwind {
 ; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
 ; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
 define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
-  %loada = load <4 x i32> addrspace(1)* %a, align 16
-  %loadb = load <4 x i32> addrspace(1)* %b, align 16
+  %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16
+  %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16
   %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
   %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
   %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
@@ -441,8 +441,8 @@ define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i
 ; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
 ; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
 define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
-  %loada = load <4 x i32> addrspace(1)* %a, align 16
-  %loadb = load <4 x i32> addrspace(1)* %b, align 16
+  %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16
+  %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16
   %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
   %shl = shl <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
   %ashr = ashr <4 x i32> %shl, <i32 16, i32 16, i32 16, i32 16>
@@ -459,7 +459,7 @@ define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x
 ; SI: v_bfe_i32
 ; SI: buffer_store_short
 define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
-  %tmp5 = load i8 addrspace(1)* %src, align 1
+  %tmp5 = load i8, i8 addrspace(1)* %src, align 1
   %tmp2 = sext i8 %tmp5 to i32
   %tmp3 = tail call i32 @llvm.AMDGPU.imax(i32 %tmp2, i32 0) nounwind readnone
   %tmp4 = trunc i32 %tmp3 to i8
@@ -474,7 +474,7 @@ declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
-  %load = load i32 addrspace(1)* %ptr, align 4
+  %load = load i32, i32 addrspace(1)* %ptr, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 8, i32 0) nounwind readnone
   store i32 %bfe, i32 addrspace(1)* %out, align 4
   ret void
@@ -485,7 +485,7 @@ define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwin
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
-  %load = load i32 addrspace(1)* %ptr, align 4
+  %load = load i32, i32 addrspace(1)* %ptr, align 4
   %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
   %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone
   store i32 %bfe1, i32 addrspace(1)* %out, align 4
@@ -496,7 +496,7 @@ define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwin
 ; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
 ; SI: s_endpgm
 define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
-  %load = load i32 addrspace(1)* %ptr, align 4
+  %load = load i32, i32 addrspace(1)* %ptr, align 4
   %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
   %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 16) nounwind readnone
   store i32 %bfe1, i32 addrspace(1)* %out, align 4
@@ -509,7 +509,7 @@ define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwi
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
-  %load = load i32 addrspace(1)* %ptr, align 4
+  %load = load i32, i32 addrspace(1)* %ptr, align 4
   %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 16) nounwind readnone
   %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone
   store i32 %bfe1, i32 addrspace(1)* %out, align 4
@@ -545,7 +545,7 @@ define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind {
-  %load = load i8 addrspace(1)* %ptr, align 1
+  %load = load i8, i8 addrspace(1)* %ptr, align 1
   %sext = sext i8 %load to i32
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 0, i32 8) nounwind readnone
   %shl = shl i32 %bfe, 24
@@ -554,12 +554,12 @@ define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %pt
   ret void
 }
 
-; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe_0:
 ; SI: .text
+; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe_0:{{.*$}}
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_endpgm
 define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind {
-  %load = load i8 addrspace(1)* %ptr, align 1
+  %load = load i8, i8 addrspace(1)* %ptr, align 1
   %sext = sext i8 %load to i32
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 8, i32 0) nounwind readnone
   %shl = shl i32 %bfe, 24
@@ -574,7 +574,7 @@ define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %
 ; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
 ; SI: s_endpgm
 define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
   %shr = ashr i32 %shl, 31
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 0, i32 1)
@@ -589,7 +589,7 @@ define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1
 ; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1
 ; SI: s_endpgm
 define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 30
   %shr = ashr i32 %shl, 30
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 1)
@@ -599,12 +599,13 @@ define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1
 
 ; FUNC-LABEL: {{^}}sext_in_reg_i2_bfe_offset_1:
 ; SI: buffer_load_dword
-; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 30, v{{[0-9]+}}
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 30, v{{[0-9]+}}
+; SI-NOT: v_lshl
+; SI-NOT: v_ashr
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 2
 ; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2
 ; SI: s_endpgm
 define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32 addrspace(1)* %in, align 4
+  %x = load i32, i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 30
   %shr = ashr i32 %shl, 30
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 2)
diff --git a/test/CodeGen/R600/sgpr-control-flow.ll b/test/CodeGen/R600/sgpr-control-flow.ll
index f0236acc6daa..38289ced632a 100644
--- a/test/CodeGen/R600/sgpr-control-flow.ll
+++ b/test/CodeGen/R600/sgpr-control-flow.ll
@@ -64,15 +64,15 @@ endif:
 
 ; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br:
 ; SI: buffer_load_dword [[AVAL:v[0-9]+]]
-; SI: v_cmp_lt_i32_e64 [[CMP_IF:s\[[0-9]+:[0-9]+\]]], [[AVAL]], 0
+; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]]
 ; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]]
 
 ; SI: BB2_1:
 ; SI: buffer_load_dword [[AVAL:v[0-9]+]]
-; SI: v_cmp_eq_i32_e64 [[CMP_ELSE:s\[[0-9]+:[0-9]+\]]], [[AVAL]], 0
+; SI: v_cmp_eq_i32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]
 ; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]]
 
-; SI: v_cmp_ne_i32_e64 [[CMP_CMP:s\[[0-9]+:[0-9]+\]]], [[V_CMP]], 0
+; SI: v_cmp_ne_i32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]]
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]]
 ; SI: buffer_store_dword [[RESULT]]
 define void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
@@ -82,14 +82,14 @@ entry:
   br i1 %tmp1, label %if, label %else
 
 if:
-  %gep.if = getelementptr i32 addrspace(1)* %a, i32 %tid
-  %a.val = load i32 addrspace(1)* %gep.if
+  %gep.if = getelementptr i32, i32 addrspace(1)* %a, i32 %tid
+  %a.val = load i32, i32 addrspace(1)* %gep.if
   %cmp.if = icmp eq i32 %a.val, 0
   br label %endif
 
 else:
-  %gep.else = getelementptr i32 addrspace(1)* %b, i32 %tid
-  %b.val = load i32 addrspace(1)* %gep.else
+  %gep.else = getelementptr i32, i32 addrspace(1)* %b, i32 %tid
+  %b.val = load i32, i32 addrspace(1)* %gep.else
   %cmp.else = icmp slt i32 %b.val, 0
   br label %endif
 
diff --git a/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll b/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll
index 893f5a3c50db..df67fcca22fe 100644
--- a/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll
+++ b/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll
@@ -7,7 +7,7 @@
 ; SI-LABEL: {{^}}test_dup_operands:
 ; SI: v_add_i32_e32
 define void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) {
-  %a = load <2 x i32> addrspace(1)* %in
+  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %lo = extractelement <2 x i32> %a, i32 0
   %hi = extractelement <2 x i32> %a, i32 1
   %add = add i32 %lo, %lo
diff --git a/test/CodeGen/R600/sgpr-copy.ll b/test/CodeGen/R600/sgpr-copy.ll
index 57cbadd9239d..b849c4038bc7 100644
--- a/test/CodeGen/R600/sgpr-copy.ll
+++ b/test/CodeGen/R600/sgpr-copy.ll
@@ -9,8 +9,8 @@
 
 define void @phi1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
-  %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
-  %21 = load <16 x i8> addrspace(2)* %20, !tbaa !1
+  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
+  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1
   %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0)
   %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
   %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32)
@@ -33,8 +33,8 @@ ENDIF:                                            ; preds = %main_body, %ELSE
 ; CHECK-LABEL: {{^}}phi2:
 define void @phi2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
-  %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
-  %21 = load <16 x i8> addrspace(2)* %20, !tbaa !1
+  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
+  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1
   %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
   %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32)
   %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 36)
@@ -50,10 +50,10 @@ main_body:
   %34 = call float @llvm.SI.load.const(<16 x i8> %21, i32 84)
   %35 = call float @llvm.SI.load.const(<16 x i8> %21, i32 88)
   %36 = call float @llvm.SI.load.const(<16 x i8> %21, i32 92)
-  %37 = getelementptr <32 x i8> addrspace(2)* %2, i32 0
-  %38 = load <32 x i8> addrspace(2)* %37, !tbaa !1
-  %39 = getelementptr <16 x i8> addrspace(2)* %1, i32 0
-  %40 = load <16 x i8> addrspace(2)* %39, !tbaa !1
+  %37 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0
+  %38 = load <32 x i8>, <32 x i8> addrspace(2)* %37, !tbaa !1
+  %39 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %1, i32 0
+  %40 = load <16 x i8>, <16 x i8> addrspace(2)* %39, !tbaa !1
   %41 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5)
   %42 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5)
   %43 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %3, <2 x i32> %5)
@@ -154,8 +154,8 @@ ENDIF24:                                          ; preds = %ENDIF, %IF25
 
 define void @loop(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
-  %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
-  %21 = load <16 x i8> addrspace(2)* %20, !tbaa !1
+  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
+  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1
   %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0)
   %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 4)
   %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 8)
@@ -236,13 +236,13 @@ declare i32 @llvm.SI.packf16(float, float) #1
 define void @sample_v3([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 
 entry:
-  %21 = getelementptr [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0
-  %22 = load <16 x i8> addrspace(2)* %21, !tbaa !2
+  %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0
+  %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !2
   %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 16)
-  %24 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0
-  %25 = load <32 x i8> addrspace(2)* %24, !tbaa !2
-  %26 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0
-  %27 = load <16 x i8> addrspace(2)* %26, !tbaa !2
+  %24 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0
+  %25 = load <32 x i8>, <32 x i8> addrspace(2)* %24, !tbaa !2
+  %26 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0
+  %27 = load <16 x i8>, <16 x i8> addrspace(2)* %26, !tbaa !2
   %28 = fcmp oeq float %23, 0.0
   br i1 %28, label %if, label %else
 
@@ -276,7 +276,7 @@ endif:
 ; CHECK: s_endpgm
 define void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) {
 entry:
-  %0 = load float addrspace(1)* %in0
+  %0 = load float, float addrspace(1)* %in0
   %1 = fcmp oeq float %0, 0.0
   br i1 %1, label %if0, label %endif
 
@@ -334,13 +334,13 @@ attributes #0 = { "ShaderType"="0" }
 ; CHECK: s_endpgm
 define void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
 bb:
-  %tmp = getelementptr [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0
-  %tmp22 = load <16 x i8> addrspace(2)* %tmp, !tbaa !0
+  %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0
+  %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
   %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp22, i32 16)
-  %tmp25 = getelementptr [32 x <8 x i32>] addrspace(2)* %arg3, i32 0, i32 0
-  %tmp26 = load <8 x i32> addrspace(2)* %tmp25, !tbaa !0
-  %tmp27 = getelementptr [16 x <4 x i32>] addrspace(2)* %arg2, i32 0, i32 0
-  %tmp28 = load <4 x i32> addrspace(2)* %tmp27, !tbaa !0
+  %tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %arg3, i32 0, i32 0
+  %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !0
+  %tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %arg2, i32 0, i32 0
+  %tmp28 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp27, !tbaa !0
   %tmp29 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg5, <2 x i32> %arg7)
   %tmp30 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg5, <2 x i32> %arg7)
   %tmp31 = bitcast float %tmp23 to i32
diff --git a/test/CodeGen/R600/shl.ll b/test/CodeGen/R600/shl.ll
index f89353b10844..53b63dc4b8ad 100644
--- a/test/CodeGen/R600/shl.ll
+++ b/test/CodeGen/R600/shl.ll
@@ -15,9 +15,9 @@
 ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
-  %a = load <2 x i32> addrspace(1) * %in
-  %b = load <2 x i32> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
+  %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
   %result = shl <2 x i32> %a, %b
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
@@ -42,9 +42,9 @@ define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in
 ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
-  %a = load <4 x i32> addrspace(1) * %in
-  %b = load <4 x i32> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
+  %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
   %result = shl <4 x i32> %a, %b
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
@@ -69,9 +69,9 @@ define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
 ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
 
 define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
-  %b_ptr = getelementptr i64 addrspace(1)* %in, i64 1
-  %a = load i64 addrspace(1) * %in
-  %b = load i64 addrspace(1) * %b_ptr
+  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
+  %a = load i64, i64 addrspace(1) * %in
+  %b = load i64, i64 addrspace(1) * %b_ptr
   %result = shl i64 %a, %b
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -108,9 +108,9 @@ define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
 
 define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
-  %b_ptr = getelementptr <2 x i64> addrspace(1)* %in, i64 1
-  %a = load <2 x i64> addrspace(1) * %in
-  %b = load <2 x i64> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
+  %a = load <2 x i64>, <2 x i64> addrspace(1) * %in
+  %b = load <2 x i64>, <2 x i64> addrspace(1) * %b_ptr
   %result = shl <2 x i64> %a, %b
   store <2 x i64> %result, <2 x i64> addrspace(1)* %out
   ret void
@@ -171,9 +171,9 @@ define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in
 ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
 
 define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x i64> addrspace(1)* %in, i64 1
-  %a = load <4 x i64> addrspace(1) * %in
-  %b = load <4 x i64> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
+  %a = load <4 x i64>, <4 x i64> addrspace(1) * %in
+  %b = load <4 x i64>, <4 x i64> addrspace(1) * %b_ptr
   %result = shl <4 x i64> %a, %b
   store <4 x i64> %result, <4 x i64> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/shl_add_constant.ll b/test/CodeGen/R600/shl_add_constant.ll
index 6915495beece..b1485bfaaebb 100644
--- a/test/CodeGen/R600/shl_add_constant.ll
+++ b/test/CodeGen/R600/shl_add_constant.ll
@@ -11,8 +11,8 @@ declare i32 @llvm.r600.read.tidig.x() #1
 ; SI: s_endpgm
 define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
-  %ptr = getelementptr i32 addrspace(1)* %in, i32 %tid.x
-  %val = load i32 addrspace(1)* %ptr, align 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
+  %val = load i32, i32 addrspace(1)* %ptr, align 4
   %add = add i32 %val, 9
   %result = shl i32 %add, 2
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -27,8 +27,8 @@ define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 ; SI: s_endpgm
 define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
-  %ptr = getelementptr i32 addrspace(1)* %in, i32 %tid.x
-  %val = load i32 addrspace(1)* %ptr, align 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
+  %val = load i32, i32 addrspace(1)* %ptr, align 4
   %add = add i32 %val, 9
   %result = shl i32 %add, 2
   store i32 %result, i32 addrspace(1)* %out0, align 4
@@ -45,8 +45,8 @@ define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1
 ; SI: s_endpgm
 define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
-  %ptr = getelementptr i32 addrspace(1)* %in, i32 %tid.x
-  %val = load i32 addrspace(1)* %ptr, align 4
+  %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
+  %val = load i32, i32 addrspace(1)* %ptr, align 4
   %shl = add i32 %val, 999
   %result = shl i32 %shl, 2
   store i32 %result, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/R600/shl_add_ptr.ll b/test/CodeGen/R600/shl_add_ptr.ll
index 15602e820608..6671e909cd1d 100644
--- a/test/CodeGen/R600/shl_add_ptr.ll
+++ b/test/CodeGen/R600/shl_add_ptr.ll
@@ -17,13 +17,13 @@ declare i32 @llvm.r600.read.tidig.x() #1
 
 ; SI-LABEL: {{^}}load_shl_base_lds_0:
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8 [M0]
+; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8
 ; SI: s_endpgm
 define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
-  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
   store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
   store float %val0, float addrspace(1)* %out
   ret void
@@ -34,7 +34,7 @@ define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %ad
 
 ; SI-LABEL: {{^}}load_shl_base_lds_1:
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8 [M0]
+; SI: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8
 ; SI: v_add_i32_e32 [[ADDUSE:v[0-9]+]], 8, v{{[0-9]+}}
 ; SI-DAG: buffer_store_dword [[RESULT]]
 ; SI-DAG: buffer_store_dword [[ADDUSE]]
@@ -42,8 +42,8 @@ define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %ad
 define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
-  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
   %shl_add_use = shl i32 %idx.0, 2
   store i32 %shl_add_use, i32 addrspace(1)* %add_use, align 4
   store float %val0, float addrspace(1)* %out
@@ -58,8 +58,8 @@ define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %ad
 define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
   %idx.0 = add nsw i32 %tid.x, 65535
-  %arrayidx0 = getelementptr inbounds [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0
-  %val0 = load i8 addrspace(3)* %arrayidx0
+  %arrayidx0 = getelementptr inbounds [65536 x i8], [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0
+  %val0 = load i8, i8 addrspace(3)* %arrayidx0
   store i32 %idx.0, i32 addrspace(1)* %add_use
   store i8 %val0, i8 addrspace(1)* %out
   ret void
@@ -69,17 +69,17 @@ define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)
 ; pointer can be used with an offset into the second one.
 
 ; SI-LABEL: {{^}}load_shl_base_lds_2:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: s_mov_b32 m0, -1
-; SI-NEXT: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9 [M0]
+; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
 ; SI: s_endpgm
 define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
   %idx.0 = add nsw i32 %tid.x, 64
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
-  %val0 = load float addrspace(3)* %arrayidx0, align 4
-  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0
-  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0
+  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
   %sum = fadd float %val0, %val1
   store float %sum, float addrspace(1)* %out, align 4
   ret void
@@ -87,12 +87,12 @@ define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
 
 ; SI-LABEL: {{^}}store_shl_base_lds_0:
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8 [M0]
+; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
 define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
-  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
   store float 1.0, float addrspace(3)* %arrayidx0, align 4
   store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
   ret void
@@ -107,8 +107,8 @@ define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %a
 ; define void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
 ;   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
 ;   %idx.0 = add nsw i32 %tid.x, 2
-;   %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
-;   %val = load atomic i32 addrspace(3)* %arrayidx0 seq_cst, align 4
+;   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+;   %val = load atomic i32, i32 addrspace(3)* %arrayidx0 seq_cst, align 4
 ;   store i32 %val, i32 addrspace(1)* %out, align 4
 ;   store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
 ;   ret void
@@ -122,7 +122,7 @@ define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %a
 define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
-  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
   %pair = cmpxchg i32 addrspace(3)* %arrayidx0, i32 7, i32 %swap seq_cst monotonic
   %result = extractvalue { i32, i1 } %pair, 0
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -137,7 +137,7 @@ define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace
 define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
-  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
   %val = atomicrmw xchg i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
   store i32 %val, i32 addrspace(1)* %out, align 4
   store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
@@ -151,7 +151,7 @@ define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)
 define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
-  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
   %val = atomicrmw add i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
   store i32 %val, i32 addrspace(1)* %out, align 4
   store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
@@ -165,7 +165,7 @@ define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
 define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
-  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
   %val = atomicrmw sub i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
   store i32 %val, i32 addrspace(1)* %out, align 4
   store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
@@ -179,7 +179,7 @@ define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
 define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
-  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
   %val = atomicrmw and i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
   store i32 %val, i32 addrspace(1)* %out, align 4
   store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
@@ -193,7 +193,7 @@ define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
 define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
-  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
   %val = atomicrmw or i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
   store i32 %val, i32 addrspace(1)* %out, align 4
   store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
@@ -207,7 +207,7 @@ define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
 define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
-  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
   %val = atomicrmw xor i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
   store i32 %val, i32 addrspace(1)* %out, align 4
   store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
@@ -217,7 +217,7 @@ define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
 ; define void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
 ;   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
 ;   %idx.0 = add nsw i32 %tid.x, 2
-;   %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+;   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
 ;   %val = atomicrmw nand i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
 ;   store i32 %val, i32 addrspace(1)* %out, align 4
 ;   store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
@@ -231,7 +231,7 @@ define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
 define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
-  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
   %val = atomicrmw min i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
   store i32 %val, i32 addrspace(1)* %out, align 4
   store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
@@ -245,7 +245,7 @@ define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
 define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
-  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
   %val = atomicrmw max i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
   store i32 %val, i32 addrspace(1)* %out, align 4
   store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
@@ -259,7 +259,7 @@ define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
 define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
-  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
   %val = atomicrmw umin i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
   store i32 %val, i32 addrspace(1)* %out, align 4
   store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
@@ -273,7 +273,7 @@ define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)
 define void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
-  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
   %val = atomicrmw umax i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
   store i32 %val, i32 addrspace(1)* %out, align 4
   store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
diff --git a/test/CodeGen/R600/si-annotate-cf.ll b/test/CodeGen/R600/si-annotate-cf.ll
index 1b49a8272fa3..bbcb861f37dc 100644
--- a/test/CodeGen/R600/si-annotate-cf.ll
+++ b/test/CodeGen/R600/si-annotate-cf.ll
@@ -29,10 +29,10 @@ ENDIF:
 ; FIXME: This could be folded into the s_or_b64 instruction
 ; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0
 ; SI: [[LOOP_LABEL:[A-Z0-9]+]]
-; SI: v_cmp_ne_i32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0
+; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}}
 
 ; SI_IF_BREAK instruction:
-; SI: s_or_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], [[COND]], [[ZERO]]
+; SI: s_or_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], vcc, [[ZERO]]
 
 ; SI_LOOP instruction:
 ; SI: s_andn2_b64 exec, exec, [[BREAK]]
diff --git a/test/CodeGen/R600/si-lod-bias.ll b/test/CodeGen/R600/si-lod-bias.ll
index d6cbd0fd367d..944499a11461 100644
--- a/test/CodeGen/R600/si-lod-bias.ll
+++ b/test/CodeGen/R600/si-lod-bias.ll
@@ -9,13 +9,13 @@
 
 define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
-  %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
-  %21 = load <16 x i8> addrspace(2)* %20, !tbaa !1
+  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
+  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1
   %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
-  %23 = getelementptr <32 x i8> addrspace(2)* %2, i32 0
-  %24 = load <32 x i8> addrspace(2)* %23, !tbaa !1
-  %25 = getelementptr <16 x i8> addrspace(2)* %1, i32 0
-  %26 = load <16 x i8> addrspace(2)* %25, !tbaa !1
+  %23 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0
+  %24 = load <32 x i8>, <32 x i8> addrspace(2)* %23, !tbaa !1
+  %25 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %1, i32 0
+  %26 = load <16 x i8>, <16 x i8> addrspace(2)* %25, !tbaa !1
   %27 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5)
   %28 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5)
   %29 = bitcast float %22 to i32
diff --git a/test/CodeGen/R600/si-sgpr-spill.ll b/test/CodeGen/R600/si-sgpr-spill.ll
index 18fda20b0d14..84652701f773 100644
--- a/test/CodeGen/R600/si-sgpr-spill.ll
+++ b/test/CodeGen/R600/si-sgpr-spill.ll
@@ -13,8 +13,8 @@
 
 define void @main([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
-  %21 = getelementptr [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0
-  %22 = load <16 x i8> addrspace(2)* %21, !tbaa !0
+  %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0
+  %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !0
   %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 96)
   %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 100)
   %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 104)
@@ -53,38 +53,38 @@ main_body:
   %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 372)
   %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 376)
   %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 384)
-  %61 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0
-  %62 = load <32 x i8> addrspace(2)* %61, !tbaa !0
-  %63 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0
-  %64 = load <16 x i8> addrspace(2)* %63, !tbaa !0
-  %65 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1
-  %66 = load <32 x i8> addrspace(2)* %65, !tbaa !0
-  %67 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1
-  %68 = load <16 x i8> addrspace(2)* %67, !tbaa !0
-  %69 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2
-  %70 = load <32 x i8> addrspace(2)* %69, !tbaa !0
-  %71 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2
-  %72 = load <16 x i8> addrspace(2)* %71, !tbaa !0
-  %73 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3
-  %74 = load <32 x i8> addrspace(2)* %73, !tbaa !0
-  %75 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3
-  %76 = load <16 x i8> addrspace(2)* %75, !tbaa !0
-  %77 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4
-  %78 = load <32 x i8> addrspace(2)* %77, !tbaa !0
-  %79 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4
-  %80 = load <16 x i8> addrspace(2)* %79, !tbaa !0
-  %81 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5
-  %82 = load <32 x i8> addrspace(2)* %81, !tbaa !0
-  %83 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 5
-  %84 = load <16 x i8> addrspace(2)* %83, !tbaa !0
-  %85 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6
-  %86 = load <32 x i8> addrspace(2)* %85, !tbaa !0
-  %87 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6
-  %88 = load <16 x i8> addrspace(2)* %87, !tbaa !0
-  %89 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7
-  %90 = load <32 x i8> addrspace(2)* %89, !tbaa !0
-  %91 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7
-  %92 = load <16 x i8> addrspace(2)* %91, !tbaa !0
+  %61 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0
+  %62 = load <32 x i8>, <32 x i8> addrspace(2)* %61, !tbaa !0
+  %63 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0
+  %64 = load <16 x i8>, <16 x i8> addrspace(2)* %63, !tbaa !0
+  %65 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1
+  %66 = load <32 x i8>, <32 x i8> addrspace(2)* %65, !tbaa !0
+  %67 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1
+  %68 = load <16 x i8>, <16 x i8> addrspace(2)* %67, !tbaa !0
+  %69 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2
+  %70 = load <32 x i8>, <32 x i8> addrspace(2)* %69, !tbaa !0
+  %71 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2
+  %72 = load <16 x i8>, <16 x i8> addrspace(2)* %71, !tbaa !0
+  %73 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3
+  %74 = load <32 x i8>, <32 x i8> addrspace(2)* %73, !tbaa !0
+  %75 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3
+  %76 = load <16 x i8>, <16 x i8> addrspace(2)* %75, !tbaa !0
+  %77 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4
+  %78 = load <32 x i8>, <32 x i8> addrspace(2)* %77, !tbaa !0
+  %79 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4
+  %80 = load <16 x i8>, <16 x i8> addrspace(2)* %79, !tbaa !0
+  %81 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5
+  %82 = load <32 x i8>, <32 x i8> addrspace(2)* %81, !tbaa !0
+  %83 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 5
+  %84 = load <16 x i8>, <16 x i8> addrspace(2)* %83, !tbaa !0
+  %85 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6
+  %86 = load <32 x i8>, <32 x i8> addrspace(2)* %85, !tbaa !0
+  %87 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6
+  %88 = load <16 x i8>, <16 x i8> addrspace(2)* %87, !tbaa !0
+  %89 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7
+  %90 = load <32 x i8>, <32 x i8> addrspace(2)* %89, !tbaa !0
+  %91 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7
+  %92 = load <16 x i8>, <16 x i8> addrspace(2)* %91, !tbaa !0
   %93 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6)
   %94 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6)
   %95 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6)
@@ -103,29 +103,29 @@ main_body:
   %108 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6)
   %109 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6)
   %110 = call i32 @llvm.SI.tid()
-  %111 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %110
+  %111 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %110
   %112 = bitcast float %93 to i32
   store i32 %112, i32 addrspace(3)* %111
   %113 = bitcast float %94 to i32
   store i32 %113, i32 addrspace(3)* %111
   %114 = call i32 @llvm.SI.tid()
-  %115 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %114
+  %115 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %114
   %116 = and i32 %114, -4
-  %117 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %116
+  %117 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %116
   %118 = add i32 %116, 1
-  %119 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %118
+  %119 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %118
   %120 = bitcast float %93 to i32
   store i32 %120, i32 addrspace(3)* %115
-  %121 = load i32 addrspace(3)* %117
+  %121 = load i32, i32 addrspace(3)* %117
   %122 = bitcast i32 %121 to float
-  %123 = load i32 addrspace(3)* %119
+  %123 = load i32, i32 addrspace(3)* %119
   %124 = bitcast i32 %123 to float
   %125 = fsub float %124, %122
   %126 = bitcast float %94 to i32
   store i32 %126, i32 addrspace(3)* %115
-  %127 = load i32 addrspace(3)* %117
+  %127 = load i32, i32 addrspace(3)* %117
   %128 = bitcast i32 %127 to float
-  %129 = load i32 addrspace(3)* %119
+  %129 = load i32, i32 addrspace(3)* %119
   %130 = bitcast i32 %129 to float
   %131 = fsub float %130, %128
   %132 = insertelement <4 x float> undef, float %125, i32 0
@@ -139,7 +139,7 @@ main_body:
   %140 = fmul float %60, %94
   %141 = fmul float %60, %94
   %142 = call i32 @llvm.SI.tid()
-  %143 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %142
+  %143 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %142
   %144 = bitcast float %138 to i32
   store i32 %144, i32 addrspace(3)* %143
   %145 = bitcast float %139 to i32
@@ -149,37 +149,37 @@ main_body:
   %147 = bitcast float %141 to i32
   store i32 %147, i32 addrspace(3)* %143
   %148 = call i32 @llvm.SI.tid()
-  %149 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %148
+  %149 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %148
   %150 = and i32 %148, -4
-  %151 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %150
+  %151 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %150
   %152 = add i32 %150, 2
-  %153 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %152
+  %153 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %152
   %154 = bitcast float %138 to i32
   store i32 %154, i32 addrspace(3)* %149
-  %155 = load i32 addrspace(3)* %151
+  %155 = load i32, i32 addrspace(3)* %151
   %156 = bitcast i32 %155 to float
-  %157 = load i32 addrspace(3)* %153
+  %157 = load i32, i32 addrspace(3)* %153
   %158 = bitcast i32 %157 to float
   %159 = fsub float %158, %156
   %160 = bitcast float %139 to i32
   store i32 %160, i32 addrspace(3)* %149
-  %161 = load i32 addrspace(3)* %151
+  %161 = load i32, i32 addrspace(3)* %151
   %162 = bitcast i32 %161 to float
-  %163 = load i32 addrspace(3)* %153
+  %163 = load i32, i32 addrspace(3)* %153
   %164 = bitcast i32 %163 to float
   %165 = fsub float %164, %162
   %166 = bitcast float %140 to i32
   store i32 %166, i32 addrspace(3)* %149
-  %167 = load i32 addrspace(3)* %151
+  %167 = load i32, i32 addrspace(3)* %151
   %168 = bitcast i32 %167 to float
-  %169 = load i32 addrspace(3)* %153
+  %169 = load i32, i32 addrspace(3)* %153
   %170 = bitcast i32 %169 to float
   %171 = fsub float %170, %168
   %172 = bitcast float %141 to i32
   store i32 %172, i32 addrspace(3)* %149
-  %173 = load i32 addrspace(3)* %151
+  %173 = load i32, i32 addrspace(3)* %151
   %174 = bitcast i32 %173 to float
-  %175 = load i32 addrspace(3)* %153
+  %175 = load i32, i32 addrspace(3)* %153
   %176 = bitcast i32 %175 to float
   %177 = fsub float %176, %174
   %178 = insertelement <4 x float> undef, float %159, i32 0
@@ -694,8 +694,8 @@ attributes #4 = { nounwind readonly }
 ; CHECK: s_endpgm
 define void @main1([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
-  %21 = getelementptr [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0
-  %22 = load <16 x i8> addrspace(2)* %21, !tbaa !0
+  %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0
+  %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !0
   %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 0)
   %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 4)
   %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 8)
@@ -799,42 +799,42 @@ main_body:
   %123 = call float @llvm.SI.load.const(<16 x i8> %22, i32 716)
   %124 = call float @llvm.SI.load.const(<16 x i8> %22, i32 864)
   %125 = call float @llvm.SI.load.const(<16 x i8> %22, i32 868)
-  %126 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0
-  %127 = load <32 x i8> addrspace(2)* %126, !tbaa !0
-  %128 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0
-  %129 = load <16 x i8> addrspace(2)* %128, !tbaa !0
-  %130 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1
-  %131 = load <32 x i8> addrspace(2)* %130, !tbaa !0
-  %132 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1
-  %133 = load <16 x i8> addrspace(2)* %132, !tbaa !0
-  %134 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2
-  %135 = load <32 x i8> addrspace(2)* %134, !tbaa !0
-  %136 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2
-  %137 = load <16 x i8> addrspace(2)* %136, !tbaa !0
-  %138 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3
-  %139 = load <32 x i8> addrspace(2)* %138, !tbaa !0
-  %140 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3
-  %141 = load <16 x i8> addrspace(2)* %140, !tbaa !0
-  %142 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4
-  %143 = load <32 x i8> addrspace(2)* %142, !tbaa !0
-  %144 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4
-  %145 = load <16 x i8> addrspace(2)* %144, !tbaa !0
-  %146 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5
-  %147 = load <32 x i8> addrspace(2)* %146, !tbaa !0
-  %148 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 5
-  %149 = load <16 x i8> addrspace(2)* %148, !tbaa !0
-  %150 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6
-  %151 = load <32 x i8> addrspace(2)* %150, !tbaa !0
-  %152 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6
-  %153 = load <16 x i8> addrspace(2)* %152, !tbaa !0
-  %154 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7
-  %155 = load <32 x i8> addrspace(2)* %154, !tbaa !0
-  %156 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7
-  %157 = load <16 x i8> addrspace(2)* %156, !tbaa !0
-  %158 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 8
-  %159 = load <32 x i8> addrspace(2)* %158, !tbaa !0
-  %160 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 8
-  %161 = load <16 x i8> addrspace(2)* %160, !tbaa !0
+  %126 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0
+  %127 = load <32 x i8>, <32 x i8> addrspace(2)* %126, !tbaa !0
+  %128 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0
+  %129 = load <16 x i8>, <16 x i8> addrspace(2)* %128, !tbaa !0
+  %130 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1
+  %131 = load <32 x i8>, <32 x i8> addrspace(2)* %130, !tbaa !0
+  %132 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1
+  %133 = load <16 x i8>, <16 x i8> addrspace(2)* %132, !tbaa !0
+  %134 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2
+  %135 = load <32 x i8>, <32 x i8> addrspace(2)* %134, !tbaa !0
+  %136 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2
+  %137 = load <16 x i8>, <16 x i8> addrspace(2)* %136, !tbaa !0
+  %138 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3
+  %139 = load <32 x i8>, <32 x i8> addrspace(2)* %138, !tbaa !0
+  %140 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3
+  %141 = load <16 x i8>, <16 x i8> addrspace(2)* %140, !tbaa !0
+  %142 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4
+  %143 = load <32 x i8>, <32 x i8> addrspace(2)* %142, !tbaa !0
+  %144 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4
+  %145 = load <16 x i8>, <16 x i8> addrspace(2)* %144, !tbaa !0
+  %146 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5
+  %147 = load <32 x i8>, <32 x i8> addrspace(2)* %146, !tbaa !0
+  %148 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 5
+  %149 = load <16 x i8>, <16 x i8> addrspace(2)* %148, !tbaa !0
+  %150 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6
+  %151 = load <32 x i8>, <32 x i8> addrspace(2)* %150, !tbaa !0
+  %152 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6
+  %153 = load <16 x i8>, <16 x i8> addrspace(2)* %152, !tbaa !0
+  %154 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7
+  %155 = load <32 x i8>, <32 x i8> addrspace(2)* %154, !tbaa !0
+  %156 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7
+  %157 = load <16 x i8>, <16 x i8> addrspace(2)* %156, !tbaa !0
+  %158 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 8
+  %159 = load <32 x i8>, <32 x i8> addrspace(2)* %158, !tbaa !0
+  %160 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 8
+  %161 = load <16 x i8>, <16 x i8> addrspace(2)* %160, !tbaa !0
   %162 = fcmp ugt float %17, 0.000000e+00
   %163 = select i1 %162, float 1.000000e+00, float 0.000000e+00
   %164 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6)
diff --git a/test/CodeGen/R600/si-spill-cf.ll b/test/CodeGen/R600/si-spill-cf.ll
new file mode 100644
index 000000000000..4b2d8ec6bf0a
--- /dev/null
+++ b/test/CodeGen/R600/si-spill-cf.ll
@@ -0,0 +1,501 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s -verify-machineinstrs | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s -verify-machineinstrs | FileCheck -check-prefix=SI %s
+
+; If this occurs it is likely due to reordering and the restore was
+; originally supposed to happen before SI_END_CF.
+; SI: s_or_b64 exec, exec, [[SAVED:s\[[0-9]+:[0-9]+\]|[a-z]+]]
+; SI-NOT: v_readlane_b32 [[SAVED]]
+
+define void @main() #0 {
+main_body:
+  %0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 16)
+  %1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32)
+  %2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 80)
+  %3 = call float @llvm.SI.load.const(<16 x i8> undef, i32 84)
+  %4 = call float @llvm.SI.load.const(<16 x i8> undef, i32 88)
+  %5 = call float @llvm.SI.load.const(<16 x i8> undef, i32 96)
+  %6 = call float @llvm.SI.load.const(<16 x i8> undef, i32 100)
+  %7 = call float @llvm.SI.load.const(<16 x i8> undef, i32 104)
+  %8 = call float @llvm.SI.load.const(<16 x i8> undef, i32 112)
+  %9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 116)
+  %10 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120)
+  %11 = call float @llvm.SI.load.const(<16 x i8> undef, i32 128)
+  %12 = call float @llvm.SI.load.const(<16 x i8> undef, i32 132)
+  %13 = call float @llvm.SI.load.const(<16 x i8> undef, i32 136)
+  %14 = call float @llvm.SI.load.const(<16 x i8> undef, i32 144)
+  %15 = call float @llvm.SI.load.const(<16 x i8> undef, i32 148)
+  %16 = call float @llvm.SI.load.const(<16 x i8> undef, i32 152)
+  %17 = call float @llvm.SI.load.const(<16 x i8> undef, i32 160)
+  %18 = call float @llvm.SI.load.const(<16 x i8> undef, i32 164)
+  %19 = call float @llvm.SI.load.const(<16 x i8> undef, i32 168)
+  %20 = call float @llvm.SI.load.const(<16 x i8> undef, i32 176)
+  %21 = call float @llvm.SI.load.const(<16 x i8> undef, i32 180)
+  %22 = call float @llvm.SI.load.const(<16 x i8> undef, i32 184)
+  %23 = call float @llvm.SI.load.const(<16 x i8> undef, i32 192)
+  %24 = call float @llvm.SI.load.const(<16 x i8> undef, i32 196)
+  %25 = call float @llvm.SI.load.const(<16 x i8> undef, i32 200)
+  %26 = call float @llvm.SI.load.const(<16 x i8> undef, i32 208)
+  %27 = call float @llvm.SI.load.const(<16 x i8> undef, i32 212)
+  %28 = call float @llvm.SI.load.const(<16 x i8> undef, i32 216)
+  %29 = call float @llvm.SI.load.const(<16 x i8> undef, i32 224)
+  %30 = call float @llvm.SI.load.const(<16 x i8> undef, i32 228)
+  %31 = call float @llvm.SI.load.const(<16 x i8> undef, i32 232)
+  %32 = call float @llvm.SI.load.const(<16 x i8> undef, i32 240)
+  %33 = call float @llvm.SI.load.const(<16 x i8> undef, i32 244)
+  %34 = call float @llvm.SI.load.const(<16 x i8> undef, i32 248)
+  %35 = call float @llvm.SI.load.const(<16 x i8> undef, i32 256)
+  %36 = call float @llvm.SI.load.const(<16 x i8> undef, i32 260)
+  %37 = call float @llvm.SI.load.const(<16 x i8> undef, i32 264)
+  %38 = call float @llvm.SI.load.const(<16 x i8> undef, i32 272)
+  %39 = call float @llvm.SI.load.const(<16 x i8> undef, i32 276)
+  %40 = call float @llvm.SI.load.const(<16 x i8> undef, i32 280)
+  %41 = call float @llvm.SI.load.const(<16 x i8> undef, i32 288)
+  %42 = call float @llvm.SI.load.const(<16 x i8> undef, i32 292)
+  %43 = call float @llvm.SI.load.const(<16 x i8> undef, i32 296)
+  %44 = call float @llvm.SI.load.const(<16 x i8> undef, i32 304)
+  %45 = call float @llvm.SI.load.const(<16 x i8> undef, i32 308)
+  %46 = call float @llvm.SI.load.const(<16 x i8> undef, i32 312)
+  %47 = call float @llvm.SI.load.const(<16 x i8> undef, i32 320)
+  %48 = call float @llvm.SI.load.const(<16 x i8> undef, i32 324)
+  %49 = call float @llvm.SI.load.const(<16 x i8> undef, i32 328)
+  %50 = call float @llvm.SI.load.const(<16 x i8> undef, i32 336)
+  %51 = call float @llvm.SI.load.const(<16 x i8> undef, i32 340)
+  %52 = call float @llvm.SI.load.const(<16 x i8> undef, i32 344)
+  %53 = call float @llvm.SI.load.const(<16 x i8> undef, i32 352)
+  %54 = call float @llvm.SI.load.const(<16 x i8> undef, i32 356)
+  %55 = call float @llvm.SI.load.const(<16 x i8> undef, i32 360)
+  %56 = call float @llvm.SI.load.const(<16 x i8> undef, i32 368)
+  %57 = call float @llvm.SI.load.const(<16 x i8> undef, i32 372)
+  %58 = call float @llvm.SI.load.const(<16 x i8> undef, i32 376)
+  %59 = call float @llvm.SI.load.const(<16 x i8> undef, i32 384)
+  %60 = call float @llvm.SI.load.const(<16 x i8> undef, i32 388)
+  %61 = call float @llvm.SI.load.const(<16 x i8> undef, i32 392)
+  %62 = call float @llvm.SI.load.const(<16 x i8> undef, i32 400)
+  %63 = call float @llvm.SI.load.const(<16 x i8> undef, i32 404)
+  %64 = call float @llvm.SI.load.const(<16 x i8> undef, i32 408)
+  %65 = call float @llvm.SI.load.const(<16 x i8> undef, i32 416)
+  %66 = call float @llvm.SI.load.const(<16 x i8> undef, i32 420)
+  br label %LOOP
+
+LOOP:                                             ; preds = %ENDIF2795, %main_body
+  %temp894.0 = phi float [ 0.000000e+00, %main_body ], [ %temp894.1, %ENDIF2795 ]
+  %temp18.0 = phi float [ undef, %main_body ], [ %temp18.1, %ENDIF2795 ]
+  %67 = icmp sgt i32 undef, 4
+  br i1 %67, label %ENDLOOP, label %ENDIF
+
+ENDLOOP:                                          ; preds = %ELSE2566, %LOOP
+  %68 = call float @llvm.AMDGPU.lrp(float %0, float undef, float undef)
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float undef, float %68, float undef, float 1.000000e+00)
+  ret void
+
+ENDIF:                                            ; preds = %LOOP
+  %69 = fsub float %2, undef
+  %70 = fsub float %3, undef
+  %71 = fsub float %4, undef
+  %72 = fmul float %69, 0.000000e+00
+  %73 = fmul float %70, undef
+  %74 = fmul float %71, undef
+  %75 = fsub float %6, undef
+  %76 = fsub float %7, undef
+  %77 = fmul float %75, undef
+  %78 = fmul float %76, 0.000000e+00
+  %79 = call float @llvm.minnum.f32(float %74, float %78)
+  %80 = call float @llvm.maxnum.f32(float %72, float 0.000000e+00)
+  %81 = call float @llvm.maxnum.f32(float %73, float %77)
+  %82 = call float @llvm.maxnum.f32(float undef, float %79)
+  %83 = call float @llvm.minnum.f32(float %80, float %81)
+  %84 = call float @llvm.minnum.f32(float %83, float undef)
+  %85 = fsub float %14, undef
+  %86 = fsub float %15, undef
+  %87 = fsub float %16, undef
+  %88 = fmul float %85, undef
+  %89 = fmul float %86, undef
+  %90 = fmul float %87, undef
+  %91 = fsub float %17, undef
+  %92 = fsub float %18, undef
+  %93 = fsub float %19, undef
+  %94 = fmul float %91, 0.000000e+00
+  %95 = fmul float %92, undef
+  %96 = fmul float %93, undef
+  %97 = call float @llvm.minnum.f32(float %89, float %95)
+  %98 = call float @llvm.maxnum.f32(float %88, float %94)
+  %99 = call float @llvm.maxnum.f32(float %90, float %96)
+  %100 = call float @llvm.maxnum.f32(float undef, float %97)
+  %101 = call float @llvm.maxnum.f32(float %100, float undef)
+  %102 = call float @llvm.minnum.f32(float %98, float undef)
+  %103 = call float @llvm.minnum.f32(float %102, float %99)
+  %104 = fsub float %30, undef
+  %105 = fsub float %31, undef
+  %106 = fmul float %104, 0.000000e+00
+  %107 = fmul float %105, 0.000000e+00
+  %108 = call float @llvm.minnum.f32(float undef, float %106)
+  %109 = call float @llvm.maxnum.f32(float undef, float %107)
+  %110 = call float @llvm.maxnum.f32(float undef, float %108)
+  %111 = call float @llvm.maxnum.f32(float %110, float undef)
+  %112 = call float @llvm.minnum.f32(float undef, float %109)
+  %113 = fsub float %32, undef
+  %114 = fsub float %33, undef
+  %115 = fsub float %34, undef
+  %116 = fmul float %113, 0.000000e+00
+  %117 = fmul float %114, undef
+  %118 = fmul float %115, undef
+  %119 = fsub float %35, undef
+  %120 = fsub float %36, undef
+  %121 = fsub float %37, undef
+  %122 = fmul float %119, undef
+  %123 = fmul float %120, undef
+  %124 = fmul float %121, undef
+  %125 = call float @llvm.minnum.f32(float %116, float %122)
+  %126 = call float @llvm.minnum.f32(float %117, float %123)
+  %127 = call float @llvm.minnum.f32(float %118, float %124)
+  %128 = call float @llvm.maxnum.f32(float %125, float %126)
+  %129 = call float @llvm.maxnum.f32(float %128, float %127)
+  %130 = fsub float %38, undef
+  %131 = fsub float %39, undef
+  %132 = fsub float %40, undef
+  %133 = fmul float %130, 0.000000e+00
+  %134 = fmul float %131, undef
+  %135 = fmul float %132, undef
+  %136 = fsub float %41, undef
+  %137 = fsub float %42, undef
+  %138 = fsub float %43, undef
+  %139 = fmul float %136, undef
+  %140 = fmul float %137, undef
+  %141 = fmul float %138, undef
+  %142 = call float @llvm.minnum.f32(float %133, float %139)
+  %143 = call float @llvm.minnum.f32(float %134, float %140)
+  %144 = call float @llvm.minnum.f32(float %135, float %141)
+  %145 = call float @llvm.maxnum.f32(float %142, float %143)
+  %146 = call float @llvm.maxnum.f32(float %145, float %144)
+  %147 = fsub float %44, undef
+  %148 = fsub float %45, undef
+  %149 = fsub float %46, undef
+  %150 = fmul float %147, 0.000000e+00
+  %151 = fmul float %148, 0.000000e+00
+  %152 = fmul float %149, undef
+  %153 = fsub float %47, undef
+  %154 = fsub float %48, undef
+  %155 = fsub float %49, undef
+  %156 = fmul float %153, undef
+  %157 = fmul float %154, 0.000000e+00
+  %158 = fmul float %155, undef
+  %159 = call float @llvm.minnum.f32(float %150, float %156)
+  %160 = call float @llvm.minnum.f32(float %151, float %157)
+  %161 = call float @llvm.minnum.f32(float %152, float %158)
+  %162 = call float @llvm.maxnum.f32(float %159, float %160)
+  %163 = call float @llvm.maxnum.f32(float %162, float %161)
+  %164 = fsub float %50, undef
+  %165 = fsub float %51, undef
+  %166 = fsub float %52, undef
+  %167 = fmul float %164, undef
+  %168 = fmul float %165, 0.000000e+00
+  %169 = fmul float %166, 0.000000e+00
+  %170 = fsub float %53, undef
+  %171 = fsub float %54, undef
+  %172 = fsub float %55, undef
+  %173 = fdiv float 1.000000e+00, %temp18.0
+  %174 = fmul float %170, undef
+  %175 = fmul float %171, undef
+  %176 = fmul float %172, %173
+  %177 = call float @llvm.minnum.f32(float %167, float %174)
+  %178 = call float @llvm.minnum.f32(float %168, float %175)
+  %179 = call float @llvm.minnum.f32(float %169, float %176)
+  %180 = call float @llvm.maxnum.f32(float %177, float %178)
+  %181 = call float @llvm.maxnum.f32(float %180, float %179)
+  %182 = fsub float %62, undef
+  %183 = fsub float %63, undef
+  %184 = fsub float %64, undef
+  %185 = fmul float %182, 0.000000e+00
+  %186 = fmul float %183, undef
+  %187 = fmul float %184, undef
+  %188 = fsub float %65, undef
+  %189 = fsub float %66, undef
+  %190 = fmul float %188, undef
+  %191 = fmul float %189, undef
+  %192 = call float @llvm.maxnum.f32(float %185, float %190)
+  %193 = call float @llvm.maxnum.f32(float %186, float %191)
+  %194 = call float @llvm.maxnum.f32(float %187, float undef)
+  %195 = call float @llvm.minnum.f32(float %192, float %193)
+  %196 = call float @llvm.minnum.f32(float %195, float %194)
+  %.temp292.7 = select i1 undef, float %163, float undef
+  %temp292.9 = select i1 false, float %181, float %.temp292.7
+  %.temp292.9 = select i1 undef, float undef, float %temp292.9
+  %197 = fcmp ogt float undef, 0.000000e+00
+  %198 = fcmp olt float undef, %196
+  %199 = and i1 %197, %198
+  %200 = fcmp olt float undef, %.temp292.9
+  %201 = and i1 %199, %200
+  %temp292.11 = select i1 %201, float undef, float %.temp292.9
+  br i1 undef, label %IF2565, label %ELSE2566
+
+IF2565:                                           ; preds = %ENDIF
+  br i1 false, label %ENDIF2582, label %ELSE2584
+
+ELSE2566:                                         ; preds = %ENDIF
+  %202 = fcmp oeq float %temp292.11, 1.000000e+04
+  br i1 %202, label %ENDLOOP, label %ELSE2593
+
+ENDIF2564:                                        ; preds = %ENDIF2594, %ENDIF2588
+  %temp894.1 = phi float [ undef, %ENDIF2588 ], [ %temp894.2, %ENDIF2594 ]
+  %temp18.1 = phi float [ %219, %ENDIF2588 ], [ undef, %ENDIF2594 ]
+  %203 = fsub float %5, undef
+  %204 = fmul float %203, undef
+  %205 = call float @llvm.maxnum.f32(float undef, float %204)
+  %206 = call float @llvm.minnum.f32(float %205, float undef)
+  %207 = call float @llvm.minnum.f32(float %206, float undef)
+  %208 = fcmp ogt float undef, 0.000000e+00
+  %209 = fcmp olt float undef, 1.000000e+00
+  %210 = and i1 %208, %209
+  %211 = fcmp olt float undef, %207
+  %212 = and i1 %210, %211
+  br i1 %212, label %ENDIF2795, label %ELSE2797
+
+ELSE2584:                                         ; preds = %IF2565
+  br label %ENDIF2582
+
+ENDIF2582:                                        ; preds = %ELSE2584, %IF2565
+  %213 = fadd float %1, undef
+  %214 = fadd float 0.000000e+00, %213
+  %215 = call float @llvm.AMDIL.fraction.(float %214)
+  br i1 undef, label %IF2589, label %ELSE2590
+
+IF2589:                                           ; preds = %ENDIF2582
+  br label %ENDIF2588
+
+ELSE2590:                                         ; preds = %ENDIF2582
+  br label %ENDIF2588
+
+ENDIF2588:                                        ; preds = %ELSE2590, %IF2589
+  %216 = fsub float 1.000000e+00, %215
+  %217 = call float @llvm.sqrt.f32(float %216)
+  %218 = fmul float %217, undef
+  %219 = fadd float %218, undef
+  br label %ENDIF2564
+
+ELSE2593:                                         ; preds = %ELSE2566
+  %220 = fcmp oeq float %temp292.11, %82
+  %221 = fcmp olt float %82, %84
+  %222 = and i1 %220, %221
+  br i1 %222, label %ENDIF2594, label %ELSE2596
+
+ELSE2596:                                         ; preds = %ELSE2593
+  %223 = fcmp oeq float %temp292.11, %101
+  %224 = fcmp olt float %101, %103
+  %225 = and i1 %223, %224
+  br i1 %225, label %ENDIF2594, label %ELSE2632
+
+ENDIF2594:                                        ; preds = %ELSE2788, %ELSE2785, %ELSE2782, %ELSE2779, %IF2775, %ELSE2761, %ELSE2758, %IF2757, %ELSE2704, %ELSE2686, %ELSE2671, %ELSE2668, %IF2667, %ELSE2632, %ELSE2596, %ELSE2593
+  %temp894.2 = phi float [ 0.000000e+00, %IF2667 ], [ 0.000000e+00, %ELSE2671 ], [ 0.000000e+00, %IF2757 ], [ 0.000000e+00, %ELSE2761 ], [ %temp894.0, %ELSE2758 ], [ 0.000000e+00, %IF2775 ], [ 0.000000e+00, %ELSE2779 ], [ 0.000000e+00, %ELSE2782 ], [ %.2848, %ELSE2788 ], [ 0.000000e+00, %ELSE2785 ], [ 0.000000e+00, %ELSE2593 ], [ 0.000000e+00, %ELSE2632 ], [ 0.000000e+00, %ELSE2704 ], [ 0.000000e+00, %ELSE2686 ], [ 0.000000e+00, %ELSE2668 ], [ 0.000000e+00, %ELSE2596 ]
+  %226 = fmul float %temp894.2, undef
+  br label %ENDIF2564
+
+ELSE2632:                                         ; preds = %ELSE2596
+  br i1 undef, label %ENDIF2594, label %ELSE2650
+
+ELSE2650:                                         ; preds = %ELSE2632
+  %227 = fcmp oeq float %temp292.11, %111
+  %228 = fcmp olt float %111, %112
+  %229 = and i1 %227, %228
+  br i1 %229, label %IF2667, label %ELSE2668
+
+IF2667:                                           ; preds = %ELSE2650
+  br i1 undef, label %ENDIF2594, label %ELSE2671
+
+ELSE2668:                                         ; preds = %ELSE2650
+  %230 = fcmp oeq float %temp292.11, %129
+  %231 = fcmp olt float %129, undef
+  %232 = and i1 %230, %231
+  br i1 %232, label %ENDIF2594, label %ELSE2686
+
+ELSE2671:                                         ; preds = %IF2667
+  br label %ENDIF2594
+
+ELSE2686:                                         ; preds = %ELSE2668
+  %233 = fcmp oeq float %temp292.11, %146
+  %234 = fcmp olt float %146, undef
+  %235 = and i1 %233, %234
+  br i1 %235, label %ENDIF2594, label %ELSE2704
+
+ELSE2704:                                         ; preds = %ELSE2686
+  %236 = fcmp oeq float %temp292.11, %181
+  %237 = fcmp olt float %181, undef
+  %238 = and i1 %236, %237
+  br i1 %238, label %ENDIF2594, label %ELSE2740
+
+ELSE2740:                                         ; preds = %ELSE2704
+  br i1 undef, label %IF2757, label %ELSE2758
+
+IF2757:                                           ; preds = %ELSE2740
+  br i1 undef, label %ENDIF2594, label %ELSE2761
+
+ELSE2758:                                         ; preds = %ELSE2740
+  br i1 undef, label %IF2775, label %ENDIF2594
+
+ELSE2761:                                         ; preds = %IF2757
+  br label %ENDIF2594
+
+IF2775:                                           ; preds = %ELSE2758
+  %239 = fcmp olt float undef, undef
+  br i1 %239, label %ENDIF2594, label %ELSE2779
+
+ELSE2779:                                         ; preds = %IF2775
+  br i1 undef, label %ENDIF2594, label %ELSE2782
+
+ELSE2782:                                         ; preds = %ELSE2779
+  br i1 undef, label %ENDIF2594, label %ELSE2785
+
+ELSE2785:                                         ; preds = %ELSE2782
+  %240 = fcmp olt float undef, 0.000000e+00
+  br i1 %240, label %ENDIF2594, label %ELSE2788
+
+ELSE2788:                                         ; preds = %ELSE2785
+  %241 = fcmp olt float 0.000000e+00, undef
+  %.2848 = select i1 %241, float -1.000000e+00, float 1.000000e+00
+  br label %ENDIF2594
+
+ELSE2797:                                         ; preds = %ENDIF2564
+  %242 = fsub float %8, undef
+  %243 = fsub float %9, undef
+  %244 = fsub float %10, undef
+  %245 = fmul float %242, undef
+  %246 = fmul float %243, undef
+  %247 = fmul float %244, undef
+  %248 = fsub float %11, undef
+  %249 = fsub float %12, undef
+  %250 = fsub float %13, undef
+  %251 = fmul float %248, undef
+  %252 = fmul float %249, undef
+  %253 = fmul float %250, undef
+  %254 = call float @llvm.minnum.f32(float %245, float %251)
+  %255 = call float @llvm.minnum.f32(float %246, float %252)
+  %256 = call float @llvm.maxnum.f32(float %247, float %253)
+  %257 = call float @llvm.maxnum.f32(float %254, float %255)
+  %258 = call float @llvm.maxnum.f32(float %257, float undef)
+  %259 = call float @llvm.minnum.f32(float undef, float %256)
+  %260 = fcmp ogt float %258, 0.000000e+00
+  %261 = fcmp olt float %258, 1.000000e+00
+  %262 = and i1 %260, %261
+  %263 = fcmp olt float %258, %259
+  %264 = and i1 %262, %263
+  br i1 %264, label %ENDIF2795, label %ELSE2800
+
+ENDIF2795:                                        ; preds = %ELSE2824, %ELSE2821, %ELSE2818, %ELSE2815, %ELSE2812, %ELSE2809, %ELSE2806, %ELSE2803, %ELSE2800, %ELSE2797, %ENDIF2564
+  br label %LOOP
+
+ELSE2800:                                         ; preds = %ELSE2797
+  br i1 undef, label %ENDIF2795, label %ELSE2803
+
+ELSE2803:                                         ; preds = %ELSE2800
+  %265 = fsub float %20, undef
+  %266 = fsub float %21, undef
+  %267 = fsub float %22, undef
+  %268 = fmul float %265, undef
+  %269 = fmul float %266, undef
+  %270 = fmul float %267, 0.000000e+00
+  %271 = fsub float %23, undef
+  %272 = fsub float %24, undef
+  %273 = fsub float %25, undef
+  %274 = fmul float %271, undef
+  %275 = fmul float %272, undef
+  %276 = fmul float %273, undef
+  %277 = call float @llvm.minnum.f32(float %268, float %274)
+  %278 = call float @llvm.maxnum.f32(float %269, float %275)
+  %279 = call float @llvm.maxnum.f32(float %270, float %276)
+  %280 = call float @llvm.maxnum.f32(float %277, float undef)
+  %281 = call float @llvm.maxnum.f32(float %280, float undef)
+  %282 = call float @llvm.minnum.f32(float undef, float %278)
+  %283 = call float @llvm.minnum.f32(float %282, float %279)
+  %284 = fcmp ogt float %281, 0.000000e+00
+  %285 = fcmp olt float %281, 1.000000e+00
+  %286 = and i1 %284, %285
+  %287 = fcmp olt float %281, %283
+  %288 = and i1 %286, %287
+  br i1 %288, label %ENDIF2795, label %ELSE2806
+
+ELSE2806:                                         ; preds = %ELSE2803
+  %289 = fsub float %26, undef
+  %290 = fsub float %27, undef
+  %291 = fsub float %28, undef
+  %292 = fmul float %289, undef
+  %293 = fmul float %290, 0.000000e+00
+  %294 = fmul float %291, undef
+  %295 = fsub float %29, undef
+  %296 = fmul float %295, undef
+  %297 = call float @llvm.minnum.f32(float %292, float %296)
+  %298 = call float @llvm.minnum.f32(float %293, float undef)
+  %299 = call float @llvm.maxnum.f32(float %294, float undef)
+  %300 = call float @llvm.maxnum.f32(float %297, float %298)
+  %301 = call float @llvm.maxnum.f32(float %300, float undef)
+  %302 = call float @llvm.minnum.f32(float undef, float %299)
+  %303 = fcmp ogt float %301, 0.000000e+00
+  %304 = fcmp olt float %301, 1.000000e+00
+  %305 = and i1 %303, %304
+  %306 = fcmp olt float %301, %302
+  %307 = and i1 %305, %306
+  br i1 %307, label %ENDIF2795, label %ELSE2809
+
+ELSE2809:                                         ; preds = %ELSE2806
+  br i1 undef, label %ENDIF2795, label %ELSE2812
+
+ELSE2812:                                         ; preds = %ELSE2809
+  br i1 undef, label %ENDIF2795, label %ELSE2815
+
+ELSE2815:                                         ; preds = %ELSE2812
+  br i1 undef, label %ENDIF2795, label %ELSE2818
+
+ELSE2818:                                         ; preds = %ELSE2815
+  br i1 undef, label %ENDIF2795, label %ELSE2821
+
+ELSE2821:                                         ; preds = %ELSE2818
+  %308 = fsub float %56, undef
+  %309 = fsub float %57, undef
+  %310 = fsub float %58, undef
+  %311 = fmul float %308, undef
+  %312 = fmul float %309, 0.000000e+00
+  %313 = fmul float %310, undef
+  %314 = fsub float %59, undef
+  %315 = fsub float %60, undef
+  %316 = fsub float %61, undef
+  %317 = fmul float %314, undef
+  %318 = fmul float %315, undef
+  %319 = fmul float %316, undef
+  %320 = call float @llvm.maxnum.f32(float %311, float %317)
+  %321 = call float @llvm.maxnum.f32(float %312, float %318)
+  %322 = call float @llvm.maxnum.f32(float %313, float %319)
+  %323 = call float @llvm.minnum.f32(float %320, float %321)
+  %324 = call float @llvm.minnum.f32(float %323, float %322)
+  %325 = fcmp ogt float undef, 0.000000e+00
+  %326 = fcmp olt float undef, 1.000000e+00
+  %327 = and i1 %325, %326
+  %328 = fcmp olt float undef, %324
+  %329 = and i1 %327, %328
+  br i1 %329, label %ENDIF2795, label %ELSE2824
+
+ELSE2824:                                         ; preds = %ELSE2821
+  %.2849 = select i1 undef, float 0.000000e+00, float 1.000000e+00
+  br label %ENDIF2795
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDIL.fraction.(float) #2
+
+; Function Attrs: nounwind readnone
+declare float @llvm.sqrt.f32(float) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.minnum.f32(float, float) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.maxnum.f32(float, float) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.lrp(float, float, float) #2
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { readnone }
diff --git a/test/CodeGen/R600/si-triv-disjoint-mem-access.ll b/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
index f6dcb388248a..5a6129aaa3fa 100644
--- a/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
+++ b/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
@@ -14,14 +14,14 @@ declare void @llvm.AMDGPU.barrier.local() #2
 ; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
 ; CI: buffer_store_dword
 define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
-  %ptr0 = load i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+  %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
 
-  %ptr1 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 1
-  %ptr2 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 2
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
+  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
 
-  %tmp1 = load i32 addrspace(3)* %ptr1, align 4
+  %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
   store i32 99, i32 addrspace(1)* %gptr, align 4
-  %tmp2 = load i32 addrspace(3)* %ptr2, align 4
+  %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
 
   %add = add nsw i32 %tmp1, %tmp2
 
@@ -34,14 +34,14 @@ define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out,
 ; CI: buffer_store_dword
 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
 define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
-  %ptr0 = load i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+  %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
 
-  %ptr1 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 1
-  %ptr2 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 2
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
+  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
 
-  %tmp1 = load i32 addrspace(3)* %ptr1, align 4
+  %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
   store volatile i32 99, i32 addrspace(1)* %gptr, align 4
-  %tmp2 = load i32 addrspace(3)* %ptr2, align 4
+  %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
 
   %add = add nsw i32 %tmp1, %tmp2
 
@@ -54,15 +54,15 @@ define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspac
 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
 ; CI: buffer_store_dword
 define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
-  %ptr0 = load i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+  %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
 
-  %ptr1 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 1
-  %ptr2 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 2
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
+  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
 
-  %tmp1 = load i32 addrspace(3)* %ptr1, align 4
+  %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
   store i32 99, i32 addrspace(1)* %gptr, align 4
   call void @llvm.AMDGPU.barrier.local() #2
-  %tmp2 = load i32 addrspace(3)* %ptr2, align 4
+  %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
 
   %add = add nsw i32 %tmp1, %tmp2
 
@@ -79,14 +79,14 @@ define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace
 ; CI: buffer_load_dword
 ; CI: buffer_store_dword
 define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
-  %ptr0 = load i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
+  %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
 
-  %ptr1 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 1
-  %ptr2 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 2
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
 
-  %tmp1 = load i32 addrspace(2)* %ptr1, align 4
+  %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
   store i32 99, i32 addrspace(1)* %gptr, align 4
-  %tmp2 = load i32 addrspace(2)* %ptr2, align 4
+  %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
 
   %add = add nsw i32 %tmp1, %tmp2
 
@@ -94,22 +94,20 @@ define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1
   ret void
 }
 
-; XXX: Should be able to reorder this, but the laods count as ordered
-
 ; FUNC-LABEL: @reorder_constant_load_local_store_constant_load
 ; CI: buffer_load_dword
-; CI: ds_write_b32
 ; CI: buffer_load_dword
+; CI: ds_write_b32
 ; CI: buffer_store_dword
 define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
-  %ptr0 = load i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
+  %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
 
-  %ptr1 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 1
-  %ptr2 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 2
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
 
-  %tmp1 = load i32 addrspace(2)* %ptr1, align 4
+  %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
   store i32 99, i32 addrspace(3)* %lptr, align 4
-  %tmp2 = load i32 addrspace(2)* %ptr2, align 4
+  %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
 
   %add = add nsw i32 %tmp1, %tmp2
 
@@ -124,12 +122,12 @@ define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %
 ; CI: ds_write_b32
 ; CI: buffer_store_dword
 define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 {
-  %ptr1 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 1
-  %ptr2 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 2
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
 
-  %tmp1 = load i32 addrspace(2)* %ptr1, align 4
+  %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
   store i32 99, i32 addrspace(3)* %lptr, align 4
-  %tmp2 = load i32 addrspace(2)* %ptr2, align 4
+  %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
 
   %add = add nsw i32 %tmp1, %tmp2
 
@@ -143,12 +141,12 @@ define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32
 ; CI: ds_write_b32
 ; CI: buffer_store_dword
 define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
-  %ptr1 = getelementptr inbounds i32 addrspace(1)* %ptr0, i64 1
-  %ptr2 = getelementptr inbounds i32 addrspace(1)* %ptr0, i64 2
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 2
 
-  %tmp1 = load i32 addrspace(1)* %ptr1, align 4
+  %tmp1 = load i32, i32 addrspace(1)* %ptr1, align 4
   store i32 99, i32 addrspace(3)* %lptr, align 4
-  %tmp2 = load i32 addrspace(1)* %ptr2, align 4
+  %tmp2 = load i32, i32 addrspace(1)* %ptr2, align 4
 
   %add = add nsw i32 %tmp1, %tmp2
 
@@ -165,15 +163,15 @@ define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out,
 ; CI: buffer_store_dword
 ; CI: s_endpgm
 define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
-  %ptr1 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 3
-  %ptr2 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 100
-  %ptr3 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 101
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
+  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100
+  %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 101
 
   store i32 123, i32 addrspace(3)* %ptr1, align 4
-  %tmp1 = load i32 addrspace(3)* %ptr2, align 4
-  %tmp2 = load i32 addrspace(3)* %ptr3, align 4
+  %tmp1 = load i32, i32 addrspace(3)* %ptr2, align 4
+  %tmp2 = load i32, i32 addrspace(3)* %ptr3, align 4
   store i32 123, i32 addrspace(3)* %ptr2, align 4
-  %tmp3 = load i32 addrspace(3)* %ptr1, align 4
+  %tmp3 = load i32, i32 addrspace(3)* %ptr1, align 4
   store i32 789, i32 addrspace(3)* %ptr3, align 4
 
   %add.0 = add nsw i32 %tmp2, %tmp1
@@ -191,15 +189,15 @@ define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspa
 ; CI: buffer_store_dword
 ; CI: s_endpgm
 define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
-  %ptr1 = getelementptr inbounds i32 addrspace(1)* %ptr0, i32 3
-  %ptr2 = getelementptr inbounds i32 addrspace(1)* %ptr0, i32 100
-  %ptr3 = getelementptr inbounds i32 addrspace(1)* %ptr0, i32 101
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3
+  %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100
+  %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 101
 
   store i32 123, i32 addrspace(1)* %ptr1, align 4
-  %tmp1 = load i32 addrspace(1)* %ptr2, align 4
-  %tmp2 = load i32 addrspace(1)* %ptr3, align 4
+  %tmp1 = load i32, i32 addrspace(1)* %ptr2, align 4
+  %tmp2 = load i32, i32 addrspace(1)* %ptr3, align 4
   store i32 123, i32 addrspace(1)* %ptr2, align 4
-  %tmp3 = load i32 addrspace(1)* %ptr1, align 4
+  %tmp3 = load i32, i32 addrspace(1)* %ptr1, align 4
   store i32 789, i32 addrspace(1)* %ptr3, align 4
 
   %add.0 = add nsw i32 %tmp2, %tmp1
@@ -213,19 +211,19 @@ define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrsp
 ; XCI: TBUFFER_STORE_FORMAT
 ; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x8
 ; define void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #1 {
-;   %ptr0 = load i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+;   %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
 
-;   %ptr1 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 1
-;   %ptr2 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 2
+;   %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
+;   %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
 
-;   %tmp1 = load i32 addrspace(3)* %ptr1, align 4
+;   %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
 
 ;   %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
 ;   call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
 ;         i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1,
 ;         i32 1, i32 0)
 
-;   %tmp2 = load i32 addrspace(3)* %ptr2, align 4
+;   %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
 
 ;   %add = add nsw i32 %tmp1, %tmp2
 
diff --git a/test/CodeGen/R600/si-vector-hang.ll b/test/CodeGen/R600/si-vector-hang.ll
index 61812c61ba19..94c47fe3c600 100644
--- a/test/CodeGen/R600/si-vector-hang.ll
+++ b/test/CodeGen/R600/si-vector-hang.ll
@@ -17,52 +17,52 @@ target triple = "r600--"
 ; Function Attrs: nounwind
 define void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 {
 entry:
-  %0 = load i8 addrspace(1)* %in0, align 1
+  %0 = load i8, i8 addrspace(1)* %in0, align 1
   %1 = insertelement <8 x i8> undef, i8 %0, i32 0
-  %arrayidx2.i.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 1
-  %2 = load i8 addrspace(1)* %arrayidx2.i.i, align 1
+  %arrayidx2.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 1
+  %2 = load i8, i8 addrspace(1)* %arrayidx2.i.i, align 1
   %3 = insertelement <8 x i8> %1, i8 %2, i32 1
-  %arrayidx6.i.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 2
-  %4 = load i8 addrspace(1)* %arrayidx6.i.i, align 1
+  %arrayidx6.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 2
+  %4 = load i8, i8 addrspace(1)* %arrayidx6.i.i, align 1
   %5 = insertelement <8 x i8> %3, i8 %4, i32 2
-  %arrayidx10.i.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 3
-  %6 = load i8 addrspace(1)* %arrayidx10.i.i, align 1
+  %arrayidx10.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 3
+  %6 = load i8, i8 addrspace(1)* %arrayidx10.i.i, align 1
   %7 = insertelement <8 x i8> %5, i8 %6, i32 3
-  %arrayidx.i.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 4
-  %8 = load i8 addrspace(1)* %arrayidx.i.i, align 1
+  %arrayidx.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 4
+  %8 = load i8, i8 addrspace(1)* %arrayidx.i.i, align 1
   %9 = insertelement <8 x i8> undef, i8 %8, i32 0
-  %arrayidx2.i9.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 5
-  %10 = load i8 addrspace(1)* %arrayidx2.i9.i, align 1
+  %arrayidx2.i9.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 5
+  %10 = load i8, i8 addrspace(1)* %arrayidx2.i9.i, align 1
   %11 = insertelement <8 x i8> %9, i8 %10, i32 1
-  %arrayidx6.i11.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 6
-  %12 = load i8 addrspace(1)* %arrayidx6.i11.i, align 1
+  %arrayidx6.i11.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 6
+  %12 = load i8, i8 addrspace(1)* %arrayidx6.i11.i, align 1
   %13 = insertelement <8 x i8> %11, i8 %12, i32 2
-  %arrayidx10.i13.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 7
-  %14 = load i8 addrspace(1)* %arrayidx10.i13.i, align 1
+  %arrayidx10.i13.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 7
+  %14 = load i8, i8 addrspace(1)* %arrayidx10.i13.i, align 1
   %15 = insertelement <8 x i8> %13, i8 %14, i32 3
   %vecinit5.i = shufflevector <8 x i8> %7, <8 x i8> %15, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-  %16 = load i8 addrspace(1)* %in1, align 1
+  %16 = load i8, i8 addrspace(1)* %in1, align 1
   %17 = insertelement <8 x i8> undef, i8 %16, i32 0
-  %arrayidx2.i.i4 = getelementptr inbounds i8 addrspace(1)* %in1, i64 1
-  %18 = load i8 addrspace(1)* %arrayidx2.i.i4, align 1
+  %arrayidx2.i.i4 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 1
+  %18 = load i8, i8 addrspace(1)* %arrayidx2.i.i4, align 1
   %19 = insertelement <8 x i8> %17, i8 %18, i32 1
-  %arrayidx6.i.i5 = getelementptr inbounds i8 addrspace(1)* %in1, i64 2
-  %20 = load i8 addrspace(1)* %arrayidx6.i.i5, align 1
+  %arrayidx6.i.i5 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 2
+  %20 = load i8, i8 addrspace(1)* %arrayidx6.i.i5, align 1
   %21 = insertelement <8 x i8> %19, i8 %20, i32 2
-  %arrayidx10.i.i6 = getelementptr inbounds i8 addrspace(1)* %in1, i64 3
-  %22 = load i8 addrspace(1)* %arrayidx10.i.i6, align 1
+  %arrayidx10.i.i6 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 3
+  %22 = load i8, i8 addrspace(1)* %arrayidx10.i.i6, align 1
   %23 = insertelement <8 x i8> %21, i8 %22, i32 3
-  %arrayidx.i.i7 = getelementptr inbounds i8 addrspace(1)* %in1, i64 4
-  %24 = load i8 addrspace(1)* %arrayidx.i.i7, align 1
+  %arrayidx.i.i7 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 4
+  %24 = load i8, i8 addrspace(1)* %arrayidx.i.i7, align 1
   %25 = insertelement <8 x i8> undef, i8 %24, i32 0
-  %arrayidx2.i9.i8 = getelementptr inbounds i8 addrspace(1)* %in1, i64 5
-  %26 = load i8 addrspace(1)* %arrayidx2.i9.i8, align 1
+  %arrayidx2.i9.i8 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 5
+  %26 = load i8, i8 addrspace(1)* %arrayidx2.i9.i8, align 1
   %27 = insertelement <8 x i8> %25, i8 %26, i32 1
-  %arrayidx6.i11.i9 = getelementptr inbounds i8 addrspace(1)* %in1, i64 6
-  %28 = load i8 addrspace(1)* %arrayidx6.i11.i9, align 1
+  %arrayidx6.i11.i9 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 6
+  %28 = load i8, i8 addrspace(1)* %arrayidx6.i11.i9, align 1
   %29 = insertelement <8 x i8> %27, i8 %28, i32 2
-  %arrayidx10.i13.i10 = getelementptr inbounds i8 addrspace(1)* %in1, i64 7
-  %30 = load i8 addrspace(1)* %arrayidx10.i13.i10, align 1
+  %arrayidx10.i13.i10 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 7
+  %30 = load i8, i8 addrspace(1)* %arrayidx10.i13.i10, align 1
   %31 = insertelement <8 x i8> %29, i8 %30, i32 3
   %vecinit5.i11 = shufflevector <8 x i8> %23, <8 x i8> %31, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   %cmp.i = icmp slt <8 x i8> %vecinit5.i, %vecinit5.i11
@@ -70,25 +70,25 @@ entry:
   %32 = extractelement <8 x i8> %cond.i, i32 0
   store i8 %32, i8 addrspace(1)* %out, align 1
   %33 = extractelement <8 x i8> %cond.i, i32 1
-  %arrayidx2.i.i.i = getelementptr inbounds i8 addrspace(1)* %out, i64 1
+  %arrayidx2.i.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
   store i8 %33, i8 addrspace(1)* %arrayidx2.i.i.i, align 1
   %34 = extractelement <8 x i8> %cond.i, i32 2
-  %arrayidx.i.i.i = getelementptr inbounds i8 addrspace(1)* %out, i64 2
+  %arrayidx.i.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 2
   store i8 %34, i8 addrspace(1)* %arrayidx.i.i.i, align 1
   %35 = extractelement <8 x i8> %cond.i, i32 3
-  %arrayidx2.i6.i.i = getelementptr inbounds i8 addrspace(1)* %out, i64 3
+  %arrayidx2.i6.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 3
   store i8 %35, i8 addrspace(1)* %arrayidx2.i6.i.i, align 1
-  %arrayidx.i.i3 = getelementptr inbounds i8 addrspace(1)* %out, i64 4
+  %arrayidx.i.i3 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 4
   %36 = extractelement <8 x i8> %cond.i, i32 4
   store i8 %36, i8 addrspace(1)* %arrayidx.i.i3, align 1
   %37 = extractelement <8 x i8> %cond.i, i32 5
-  %arrayidx2.i.i6.i = getelementptr inbounds i8 addrspace(1)* %out, i64 5
+  %arrayidx2.i.i6.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 5
   store i8 %37, i8 addrspace(1)* %arrayidx2.i.i6.i, align 1
   %38 = extractelement <8 x i8> %cond.i, i32 6
-  %arrayidx.i.i7.i = getelementptr inbounds i8 addrspace(1)* %out, i64 6
+  %arrayidx.i.i7.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 6
   store i8 %38, i8 addrspace(1)* %arrayidx.i.i7.i, align 1
   %39 = extractelement <8 x i8> %cond.i, i32 7
-  %arrayidx2.i6.i8.i = getelementptr inbounds i8 addrspace(1)* %out, i64 7
+  %arrayidx2.i6.i8.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 7
   store i8 %39, i8 addrspace(1)* %arrayidx2.i6.i8.i, align 1
   ret void
 }
diff --git a/test/CodeGen/R600/sign_extend.ll b/test/CodeGen/R600/sign_extend.ll
index 9550c2a7f061..06bee114c23a 100644
--- a/test/CodeGen/R600/sign_extend.ll
+++ b/test/CodeGen/R600/sign_extend.ll
@@ -24,8 +24,9 @@ entry:
 }
 
 ; SI-LABEL: {{^}}s_sext_i1_to_i64:
-; SI: v_cndmask_b32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64 v[[LOREG:[0-9]+]], 0, -1, vcc
+; SI: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}}
 ; SI: s_endpgm
 define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp eq i32 %a, %b
@@ -47,7 +48,7 @@ define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
 ; SI: v_ashr
 ; SI: s_endpgm
 define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %val = load i32 addrspace(1)* %in, align 4
+  %val = load i32, i32 addrspace(1)* %in, align 4
   %sext = sext i32 %val to i64
   store i64 %sext, i64 addrspace(1)* %out, align 8
   ret void
diff --git a/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll b/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll
index 28a413cd1b3c..dffee70b6b02 100644
--- a/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll
+++ b/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll
@@ -22,16 +22,16 @@ define void @trunc_select_i64(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) {
 define void @trunc_load_alloca_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) {
   %idx = add i32 %a, %b
   %alloca = alloca i64, i32 4
-  %gep0 = getelementptr i64* %alloca, i64 0
-  %gep1 = getelementptr i64* %alloca, i64 1
-  %gep2 = getelementptr i64* %alloca, i64 2
-  %gep3 = getelementptr i64* %alloca, i64 3
+  %gep0 = getelementptr i64, i64* %alloca, i64 0
+  %gep1 = getelementptr i64, i64* %alloca, i64 1
+  %gep2 = getelementptr i64, i64* %alloca, i64 2
+  %gep3 = getelementptr i64, i64* %alloca, i64 3
   store i64 24, i64* %gep0, align 8
   store i64 9334, i64* %gep1, align 8
   store i64 3935, i64* %gep2, align 8
   store i64 9342, i64* %gep3, align 8
-  %gep = getelementptr i64* %alloca, i32 %idx
-  %load = load i64* %gep, align 8
+  %gep = getelementptr i64, i64* %alloca, i32 %idx
+  %load = load i64, i64* %gep, align 8
   %mask = and i64 %load, 4294967296
   %add = add i64 %mask, -1
   store i64 %add, i64 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/R600/sint_to_fp.f64.ll b/test/CodeGen/R600/sint_to_fp.f64.ll
index 77844a6aa384..da4e91db3a38 100644
--- a/test/CodeGen/R600/sint_to_fp.f64.ll
+++ b/test/CodeGen/R600/sint_to_fp.f64.ll
@@ -10,12 +10,13 @@ define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
   ret void
 }
 
+; FIXME: select on 0, 0
 ; SI-LABEL: {{^}}sint_to_fp_i1_f64:
 ; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
 ; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
 ; uses an SGPR for [[CMP]]
 ; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]]
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
@@ -52,8 +53,8 @@ define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
 ; SI: buffer_store_dwordx2 [[RESULT]]
 define void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep = getelementptr i64 addrspace(1)* %in, i32 %tid
-  %val = load i64 addrspace(1)* %gep, align 8
+  %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %val = load i64, i64 addrspace(1)* %gep, align 8
   %result = sitofp i64 %val to double
   store double %result, double addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/sint_to_fp.ll b/test/CodeGen/R600/sint_to_fp.ll
index 6a291cfe9269..8506441d1361 100644
--- a/test/CodeGen/R600/sint_to_fp.ll
+++ b/test/CodeGen/R600/sint_to_fp.ll
@@ -35,7 +35,7 @@ define void @sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
 ; SI: v_cvt_f32_i32_e32
 ; SI: v_cvt_f32_i32_e32
 define void @sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %value = load <4 x i32> addrspace(1) * %in
+  %value = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %result = sitofp <4 x i32> %value to <4 x float>
   store <4 x float> %result, <4 x float> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/smrd.ll b/test/CodeGen/R600/smrd.ll
index a66ad0201bf9..b0c18ca5959c 100644
--- a/test/CodeGen/R600/smrd.ll
+++ b/test/CodeGen/R600/smrd.ll
@@ -1,65 +1,73 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s
 
 ; SMRD load with an immediate offset.
-; CHECK-LABEL: {{^}}smrd0:
-; CHECK: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
+; GCN-LABEL: {{^}}smrd0:
+; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
+; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
 define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
-  %0 = getelementptr i32 addrspace(2)* %ptr, i64 1
-  %1 = load i32 addrspace(2)* %0
+  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
+  %1 = load i32, i32 addrspace(2)* %0
   store i32 %1, i32 addrspace(1)* %out
   ret void
 }
 
 ; SMRD load with the largest possible immediate offset.
-; CHECK-LABEL: {{^}}smrd1:
-; CHECK: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
+; GCN-LABEL: {{^}}smrd1:
+; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
+; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
 define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
-  %0 = getelementptr i32 addrspace(2)* %ptr, i64 255
-  %1 = load i32 addrspace(2)* %0
+  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
+  %1 = load i32, i32 addrspace(2)* %0
   store i32 %1, i32 addrspace(1)* %out
   ret void
 }
 
 ; SMRD load with an offset greater than the largest possible immediate.
-; CHECK-LABEL: {{^}}smrd2:
-; CHECK: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
-; CHECK: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
-; CHECK: s_endpgm
+; GCN-LABEL: {{^}}smrd2:
+; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
+; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
+; GCN: s_endpgm
 define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
-  %0 = getelementptr i32 addrspace(2)* %ptr, i64 256
-  %1 = load i32 addrspace(2)* %0
+  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256
+  %1 = load i32, i32 addrspace(2)* %0
   store i32 %1, i32 addrspace(1)* %out
   ret void
 }
 
 ; SMRD load with a 64-bit offset
-; CHECK-LABEL: {{^}}smrd3:
-; CHECK-DAG: s_mov_b32 s[[SHI:[0-9]+]], 4
-; CHECK-DAG: s_mov_b32 s[[SLO:[0-9]+]], 0 ;
-; FIXME: We don't need to copy these values to VGPRs
-; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
-; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; GCN-LABEL: {{^}}smrd3:
+; FIXME: There are too many copies here because we don't fold immediates
+;        through REG_SEQUENCE
+; SI: s_mov_b32 s[[SLO:[0-9]+]], 0 ;
+; SI: s_mov_b32 s[[SHI:[0-9]+]], 4
+; SI: s_mov_b32 s[[SSLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SSLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
 ; FIXME: We should be able to use s_load_dword here
-; CHECK: buffer_load_dword v{{[0-9]+}}, v{{\[}}[[VLO]]:[[VHI]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64
-; CHECK: s_endpgm
+; SI: buffer_load_dword v{{[0-9]+}}, v{{\[}}[[VLO]]:[[VHI]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64
+; TODO: Add VI checks
+; GCN: s_endpgm
 define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
-  %0 = getelementptr i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
-  %1 = load i32 addrspace(2)* %0
+  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
+  %1 = load i32, i32 addrspace(2)* %0
   store i32 %1, i32 addrspace(1)* %out
   ret void
 }
 
 ; SMRD load using the load.const intrinsic with an immediate offset
-; CHECK-LABEL: {{^}}smrd_load_const0:
-; CHECK: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
+; GCN-LABEL: {{^}}smrd_load_const0:
+; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
+; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
 define void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
-  %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
-  %21 = load <16 x i8> addrspace(2)* %20
+  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
+  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
   %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
   call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
   ret void
@@ -67,12 +75,13 @@ main_body:
 
 ; SMRD load using the load.const intrinsic with the largest possible immediate
 ; offset.
-; CHECK-LABEL: {{^}}smrd_load_const1:
-; CHECK: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
+; GCN-LABEL: {{^}}smrd_load_const1:
+; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
+; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
 define void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
-  %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
-  %21 = load <16 x i8> addrspace(2)* %20
+  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
+  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
   %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1020)
   call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
   ret void
@@ -80,13 +89,14 @@ main_body:
 ; SMRD load using the load.const intrinsic with an offset greater than the
 ; largets possible immediate.
 ; immediate offset.
-; CHECK-LABEL: {{^}}smrd_load_const2:
-; CHECK: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
-; CHECK: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+; GCN-LABEL: {{^}}smrd_load_const2:
+; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
+; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
 define void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
-  %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
-  %21 = load <16 x i8> addrspace(2)* %20
+  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
+  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
   %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1024)
   call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
   ret void
diff --git a/test/CodeGen/R600/split-scalar-i64-add.ll b/test/CodeGen/R600/split-scalar-i64-add.ll
index ec50fd9f4c1e..46409cdfae1c 100644
--- a/test/CodeGen/R600/split-scalar-i64-add.ll
+++ b/test/CodeGen/R600/split-scalar-i64-add.ll
@@ -37,8 +37,8 @@ define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64
 ; SI: v_addc_u32
 define void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) {
   %tid = call i32 @llvm.r600.read.tidig.x() readnone
-  %gep = getelementptr i32 addrspace(1)* %in, i32 %tid
-  %load = load i32 addrspace(1)* %gep
+  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %load = load i32, i32 addrspace(1)* %gep
   %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0
   %vec.1 = insertelement <2 x i32> %vec.0, i32 %load, i32 1
   %bc = bitcast <2 x i32> %vec.1 to i64
diff --git a/test/CodeGen/R600/sra.ll b/test/CodeGen/R600/sra.ll
index d6c6ccd28382..bcbc32f4c053 100644
--- a/test/CodeGen/R600/sra.ll
+++ b/test/CodeGen/R600/sra.ll
@@ -15,9 +15,9 @@
 ;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
-  %a = load <2 x i32> addrspace(1) * %in
-  %b = load <2 x i32> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
+  %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
   %result = ashr <2 x i32> %a, %b
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
@@ -42,9 +42,9 @@ define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
 ;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
-  %a = load <4 x i32> addrspace(1) * %in
-  %b = load <4 x i32> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
+  %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
   %result = ashr <4 x i32> %a, %b
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
@@ -89,9 +89,9 @@ entry:
 
 define void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 entry:
-  %b_ptr = getelementptr i64 addrspace(1)* %in, i64 1
-  %a = load i64 addrspace(1) * %in
-  %b = load i64 addrspace(1) * %b_ptr
+  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
+  %a = load i64, i64 addrspace(1) * %in
+  %b = load i64, i64 addrspace(1) * %b_ptr
   %result = ashr i64 %a, %b
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -132,9 +132,9 @@ entry:
 ;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
 
 define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
-  %b_ptr = getelementptr <2 x i64> addrspace(1)* %in, i64 1
-  %a = load <2 x i64> addrspace(1) * %in
-  %b = load <2 x i64> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
+  %a = load <2 x i64>, <2 x i64> addrspace(1) * %in
+  %b = load <2 x i64>, <2 x i64> addrspace(1) * %b_ptr
   %result = ashr <2 x i64> %a, %b
   store <2 x i64> %result, <2 x i64> addrspace(1)* %out
   ret void
@@ -203,9 +203,9 @@ define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i
 ;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
 
 define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x i64> addrspace(1)* %in, i64 1
-  %a = load <4 x i64> addrspace(1) * %in
-  %b = load <4 x i64> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
+  %a = load <4 x i64>, <4 x i64> addrspace(1) * %in
+  %b = load <4 x i64>, <4 x i64> addrspace(1) * %b_ptr
   %result = ashr <4 x i64> %a, %b
   store <4 x i64> %result, <4 x i64> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/srem.ll b/test/CodeGen/R600/srem.ll
index 2aa8c7452542..c78fd549b316 100644
--- a/test/CodeGen/R600/srem.ll
+++ b/test/CodeGen/R600/srem.ll
@@ -1,51 +1,112 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s
 
 define void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %num = load i32 addrspace(1) * %in
-  %den = load i32 addrspace(1) * %den_ptr
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in
+  %den = load i32, i32 addrspace(1) * %den_ptr
   %result = srem i32 %num, %den
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
 
 define void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %num = load i32 addrspace(1) * %in
+  %num = load i32, i32 addrspace(1) * %in
   %result = srem i32 %num, 4
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
 
+; FUNC-LABEL: {{^}}srem_i32_7:
+; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x92492493
+; SI: v_mul_hi_i32 {{v[0-9]+}}, [[MAGIC]],
+; SI: v_mul_lo_i32
+; SI: v_sub_i32
+; SI: s_endpgm
+define void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %num = load i32, i32 addrspace(1) * %in
+  %result = srem i32 %num, 7
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
 define void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %den_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
-  %num = load <2 x i32> addrspace(1) * %in
-  %den = load <2 x i32> addrspace(1) * %den_ptr
+  %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+  %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
+  %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr
   %result = srem <2 x i32> %num, %den
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
 }
 
 define void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %num = load <2 x i32> addrspace(1) * %in
+  %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %result = srem <2 x i32> %num, <i32 4, i32 4>
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
 }
 
 define void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %den_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
-  %num = load <4 x i32> addrspace(1) * %in
-  %den = load <4 x i32> addrspace(1) * %den_ptr
+  %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+  %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
+  %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr
   %result = srem <4 x i32> %num, %den
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
 }
 
 define void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %num = load <4 x i32> addrspace(1) * %in
+  %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %result = srem <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
 }
+
+define void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %den_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
+  %num = load i64, i64 addrspace(1) * %in
+  %den = load i64, i64 addrspace(1) * %den_ptr
+  %result = srem i64 %num, %den
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+define void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %num = load i64, i64 addrspace(1) * %in
+  %result = srem i64 %num, 4
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+define void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+  %den_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
+  %num = load <2 x i64>, <2 x i64> addrspace(1) * %in
+  %den = load <2 x i64>, <2 x i64> addrspace(1) * %den_ptr
+  %result = srem <2 x i64> %num, %den
+  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+define void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+  %num = load <2 x i64>, <2 x i64> addrspace(1) * %in
+  %result = srem <2 x i64> %num, <i64 4, i64 4>
+  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+define void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+  %den_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
+  %num = load <4 x i64>, <4 x i64> addrspace(1) * %in
+  %den = load <4 x i64>, <4 x i64> addrspace(1) * %den_ptr
+  %result = srem <4 x i64> %num, %den
+  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+  ret void
+}
+
+define void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+  %num = load <4 x i64>, <4 x i64> addrspace(1) * %in
+  %result = srem <4 x i64> %num, <i64 4, i64 4, i64 4, i64 4>
+  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/srl.ll b/test/CodeGen/R600/srl.ll
index 9e7b35e8338a..4904d7fa1bd0 100644
--- a/test/CodeGen/R600/srl.ll
+++ b/test/CodeGen/R600/srl.ll
@@ -1,13 +1,15 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}lshr_i32:
 ; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %a = load i32 addrspace(1)* %in
-  %b = load i32 addrspace(1)* %b_ptr
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %a = load i32, i32 addrspace(1)* %in
+  %b = load i32, i32 addrspace(1)* %b_ptr
   %result = lshr i32 %a, %b
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -17,12 +19,15 @@ define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
+; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
 ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
-  %a = load <2 x i32> addrspace(1)* %in
-  %b = load <2 x i32> addrspace(1)* %b_ptr
+  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
+  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
   %result = lshr <2 x i32> %a, %b
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
@@ -34,14 +39,19 @@ define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
 ; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
+; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
 ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
-  %a = load <4 x i32> addrspace(1)* %in
-  %b = load <4 x i32> addrspace(1)* %b_ptr
+  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
+  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
   %result = lshr <4 x i32> %a, %b
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
@@ -49,6 +59,7 @@ define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
 
 ; FUNC-LABEL: {{^}}lshr_i64:
 ; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
 
 ; EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
 ; EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
@@ -62,9 +73,9 @@ define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
 ; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
 ; EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
 define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
-  %b_ptr = getelementptr i64 addrspace(1)* %in, i64 1
-  %a = load i64 addrspace(1)* %in
-  %b = load i64 addrspace(1)* %b_ptr
+  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
+  %a = load i64, i64 addrspace(1)* %in
+  %b = load i64, i64 addrspace(1)* %b_ptr
   %result = lshr i64 %a, %b
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -74,6 +85,9 @@ define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 ; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 
+; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+
 ; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
 ; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
 ; EG-DAG: LSHL {{\*? *}}[[COMPSHA]]
@@ -97,9 +111,9 @@ define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
 define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
-  %b_ptr = getelementptr <2 x i64> addrspace(1)* %in, i64 1
-  %a = load <2 x i64> addrspace(1)* %in
-  %b = load <2 x i64> addrspace(1)* %b_ptr
+  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
+  %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
+  %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
   %result = lshr <2 x i64> %a, %b
   store <2 x i64> %result, <2 x i64> addrspace(1)* %out
   ret void
@@ -111,6 +125,11 @@ define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i
 ; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 ; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 
+; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+
 ; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
 ; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
 ; EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
@@ -158,9 +177,9 @@ define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
 define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x i64> addrspace(1)* %in, i64 1
-  %a = load <4 x i64> addrspace(1)* %in
-  %b = load <4 x i64> addrspace(1)* %b_ptr
+  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
+  %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
+  %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
   %result = lshr <4 x i64> %a, %b
   store <4 x i64> %result, <4 x i64> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/ssubo.ll b/test/CodeGen/R600/ssubo.ll
index 09d3959b2b3d..26884a1b7761 100644
--- a/test/CodeGen/R600/ssubo.ll
+++ b/test/CodeGen/R600/ssubo.ll
@@ -28,8 +28,8 @@ define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
 
 ; FUNC-LABEL: {{^}}v_ssubo_i32:
 define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
-  %a = load i32 addrspace(1)* %aptr, align 4
-  %b = load i32 addrspace(1)* %bptr, align 4
+  %a = load i32, i32 addrspace(1)* %aptr, align 4
+  %b = load i32, i32 addrspace(1)* %bptr, align 4
   %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %ssub, 0
   %carry = extractvalue { i32, i1 } %ssub, 1
@@ -54,8 +54,8 @@ define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64
 ; SI: v_sub_i32_e32
 ; SI: v_subb_u32_e32
 define void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
-  %a = load i64 addrspace(1)* %aptr, align 4
-  %b = load i64 addrspace(1)* %bptr, align 4
+  %a = load i64, i64 addrspace(1)* %aptr, align 4
+  %b = load i64, i64 addrspace(1)* %bptr, align 4
   %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %ssub, 0
   %carry = extractvalue { i64, i1 } %ssub, 1
diff --git a/test/CodeGen/R600/store-barrier.ll b/test/CodeGen/R600/store-barrier.ll
index 350b006ba5e0..4a72b4d090ad 100644
--- a/test/CodeGen/R600/store-barrier.ll
+++ b/test/CodeGen/R600/store-barrier.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck  --check-prefix=CHECK %s
-; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck  --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck  --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck  --check-prefix=CHECK %s
 
 ; This test is for a bug in the machine scheduler where stores without
 ; an underlying object would be moved across the barrier.  In this
@@ -14,24 +14,24 @@
 ; Function Attrs: nounwind
 define void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) {
 bb:
-  %tmp10 = getelementptr inbounds i32 addrspace(1)* %arg2, i64 %tmp9
-  %tmp13 = load i32 addrspace(1)* %tmp10, align 2
-  %tmp14 = getelementptr inbounds <2 x i8> addrspace(3)* %arg, i32 %tmp13
-  %tmp15 = load <2 x i8> addrspace(3)* %tmp14, align 2
+  %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp9
+  %tmp13 = load i32, i32 addrspace(1)* %tmp10, align 2
+  %tmp14 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp13
+  %tmp15 = load <2 x i8>, <2 x i8> addrspace(3)* %tmp14, align 2
   %tmp16 = add i32 %tmp13, 1
-  %tmp17 = getelementptr inbounds <2 x i8> addrspace(3)* %arg, i32 %tmp16
+  %tmp17 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp16
   store <2 x i8> %tmp15, <2 x i8> addrspace(3)* %tmp17, align 2
   tail call void @llvm.AMDGPU.barrier.local() #2
-  %tmp25 = load i32 addrspace(1)* %tmp10, align 4
+  %tmp25 = load i32, i32 addrspace(1)* %tmp10, align 4
   %tmp26 = sext i32 %tmp25 to i64
   %tmp27 = sext i32 %arg4 to i64
-  %tmp28 = getelementptr inbounds <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 %arg4
-  %tmp29 = load i8 addrspace(3)* %tmp28, align 1
-  %tmp30 = getelementptr inbounds <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 %tmp27
+  %tmp28 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 %arg4
+  %tmp29 = load i8, i8 addrspace(3)* %tmp28, align 1
+  %tmp30 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 %tmp27
   store i8 %tmp29, i8 addrspace(1)* %tmp30, align 1
-  %tmp32 = getelementptr inbounds <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 0
-  %tmp33 = load i8 addrspace(3)* %tmp32, align 1
-  %tmp35 = getelementptr inbounds <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 0
+  %tmp32 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 0
+  %tmp33 = load i8, i8 addrspace(3)* %tmp32, align 1
+  %tmp35 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 0
   store i8 %tmp33, i8 addrspace(1)* %tmp35, align 1
   ret void
 }
diff --git a/test/CodeGen/R600/store-v3i64.ll b/test/CodeGen/R600/store-v3i64.ll
index 4db9b67e0118..e0c554ad2c17 100644
--- a/test/CodeGen/R600/store-v3i64.ll
+++ b/test/CodeGen/R600/store-v3i64.ll
@@ -1,6 +1,6 @@
 ; XFAIL: *
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}global_store_v3i64:
 ; SI: buffer_store_dwordx4
diff --git a/test/CodeGen/R600/store-vector-ptrs.ll b/test/CodeGen/R600/store-vector-ptrs.ll
index ba4d94f73245..d5af3b29118a 100644
--- a/test/CodeGen/R600/store-vector-ptrs.ll
+++ b/test/CodeGen/R600/store-vector-ptrs.ll
@@ -6,7 +6,7 @@
 ; scratch loads and stores.
 ; CHECK-LABEL: {{^}}store_vector_ptrs:
 define void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind {
-  %p = getelementptr <4 x [1024 x i32]*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
+  %p = getelementptr [1024 x i32], <4 x [1024 x i32]*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
   store <4 x i32*> %p, <4 x i32*>* %out
   ret void
 }
diff --git a/test/CodeGen/R600/store.ll b/test/CodeGen/R600/store.ll
index e4cb31365786..0f89405e073b 100644
--- a/test/CodeGen/R600/store.ll
+++ b/test/CodeGen/R600/store.ll
@@ -16,7 +16,7 @@ entry:
 }
 
 ; i8 store
-; EG-LABEL: {{^}}store_i8:
+; FUNC-LABEL: {{^}}store_i8:
 ; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
 
 ; IG 0: Get the byte index and truncate the value
@@ -37,7 +37,6 @@ entry:
 ; EG: MOV T[[RW_GPR]].Y, 0.0
 ; EG: MOV * T[[RW_GPR]].Z, 0.0
 
-; SI-LABEL: {{^}}store_i8:
 ; SI: buffer_store_byte
 
 define void @store_i8(i8 addrspace(1)* %out, i8 %in) {
@@ -47,7 +46,7 @@ entry:
 }
 
 ; i16 store
-; EG-LABEL: {{^}}store_i16:
+; FUNC-LABEL: {{^}}store_i16:
 ; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
 
 ; IG 0: Get the byte index and truncate the value
@@ -71,7 +70,6 @@ entry:
 ; EG: MOV T[[RW_GPR]].Y, 0.0
 ; EG: MOV * T[[RW_GPR]].Z, 0.0
 
-; SI-LABEL: {{^}}store_i16:
 ; SI: buffer_store_short
 define void @store_i16(i16 addrspace(1)* %out, i16 %in) {
 entry:
@@ -79,10 +77,10 @@ entry:
   ret void
 }
 
-; EG-LABEL: {{^}}store_v2i8:
+; FUNC-LABEL: {{^}}store_v2i8:
 ; EG: MEM_RAT MSKOR
 ; EG-NOT: MEM_RAT MSKOR
-; SI-LABEL: {{^}}store_v2i8:
+
 ; SI: buffer_store_byte
 ; SI: buffer_store_byte
 define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
@@ -93,11 +91,11 @@ entry:
 }
 
 
-; EG-LABEL: {{^}}store_v2i16:
+; FUNC-LABEL: {{^}}store_v2i16:
 ; EG: MEM_RAT_CACHELESS STORE_RAW
-; CM-LABEL: {{^}}store_v2i16:
+
 ; CM: MEM_RAT_CACHELESS STORE_DWORD
-; SI-LABEL: {{^}}store_v2i16:
+
 ; SI: buffer_store_short
 ; SI: buffer_store_short
 define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
@@ -107,11 +105,11 @@ entry:
   ret void
 }
 
-; EG-LABEL: {{^}}store_v4i8:
+; FUNC-LABEL: {{^}}store_v4i8:
 ; EG: MEM_RAT_CACHELESS STORE_RAW
-; CM-LABEL: {{^}}store_v4i8:
+
 ; CM: MEM_RAT_CACHELESS STORE_DWORD
-; SI-LABEL: {{^}}store_v4i8:
+
 ; SI: buffer_store_byte
 ; SI: buffer_store_byte
 ; SI: buffer_store_byte
@@ -124,11 +122,11 @@ entry:
 }
 
 ; floating-point store
-; EG-LABEL: {{^}}store_f32:
+; FUNC-LABEL: {{^}}store_f32:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1
-; CM-LABEL: {{^}}store_f32:
+
 ; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-LABEL: {{^}}store_f32:
+
 ; SI: buffer_store_dword
 
 define void @store_f32(float addrspace(1)* %out, float %in) {
@@ -136,13 +134,13 @@ define void @store_f32(float addrspace(1)* %out, float %in) {
   ret void
 }
 
-; EG-LABEL: {{^}}store_v4i16:
+; FUNC-LABEL: {{^}}store_v4i16:
 ; EG: MEM_RAT MSKOR
 ; EG: MEM_RAT MSKOR
 ; EG: MEM_RAT MSKOR
 ; EG: MEM_RAT MSKOR
 ; EG-NOT: MEM_RAT MSKOR
-; SI-LABEL: {{^}}store_v4i16:
+
 ; SI: buffer_store_short
 ; SI: buffer_store_short
 ; SI: buffer_store_short
@@ -156,11 +154,11 @@ entry:
 }
 
 ; vec2 floating-point stores
-; EG-LABEL: {{^}}store_v2f32:
+; FUNC-LABEL: {{^}}store_v2f32:
 ; EG: MEM_RAT_CACHELESS STORE_RAW
-; CM-LABEL: {{^}}store_v2f32:
+
 ; CM: MEM_RAT_CACHELESS STORE_DWORD
-; SI-LABEL: {{^}}store_v2f32:
+
 ; SI: buffer_store_dwordx2
 
 define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
@@ -171,13 +169,13 @@ entry:
   ret void
 }
 
-; EG-LABEL: {{^}}store_v4i32:
+; FUNC-LABEL: {{^}}store_v4i32:
 ; EG: MEM_RAT_CACHELESS STORE_RAW
 ; EG-NOT: MEM_RAT_CACHELESS STORE_RAW
-; CM-LABEL: {{^}}store_v4i32:
+
 ; CM: MEM_RAT_CACHELESS STORE_DWORD
 ; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
-; SI-LABEL: {{^}}store_v4i32:
+
 ; SI: buffer_store_dwordx4
 define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
 entry:
@@ -218,29 +216,29 @@ entry:
   ret void
 }
 
-; EG-LABEL: {{^}}store_local_i8:
+; FUNC-LABEL: {{^}}store_local_i8:
 ; EG: LDS_BYTE_WRITE
-; SI-LABEL: {{^}}store_local_i8:
+
 ; SI: ds_write_b8
 define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
   store i8 %in, i8 addrspace(3)* %out
   ret void
 }
 
-; EG-LABEL: {{^}}store_local_i16:
+; FUNC-LABEL: {{^}}store_local_i16:
 ; EG: LDS_SHORT_WRITE
-; SI-LABEL: {{^}}store_local_i16:
+
 ; SI: ds_write_b16
 define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
   store i16 %in, i16 addrspace(3)* %out
   ret void
 }
 
-; EG-LABEL: {{^}}store_local_v2i16:
+; FUNC-LABEL: {{^}}store_local_v2i16:
 ; EG: LDS_WRITE
-; CM-LABEL: {{^}}store_local_v2i16:
+
 ; CM: LDS_WRITE
-; SI-LABEL: {{^}}store_local_v2i16:
+
 ; SI: ds_write_b16
 ; SI: ds_write_b16
 define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
@@ -249,11 +247,11 @@ entry:
   ret void
 }
 
-; EG-LABEL: {{^}}store_local_v4i8:
+; FUNC-LABEL: {{^}}store_local_v4i8:
 ; EG: LDS_WRITE
-; CM-LABEL: {{^}}store_local_v4i8:
+
 ; CM: LDS_WRITE
-; SI-LABEL: {{^}}store_local_v4i8:
+
 ; SI: ds_write_b8
 ; SI: ds_write_b8
 ; SI: ds_write_b8
@@ -264,13 +262,13 @@ entry:
   ret void
 }
 
-; EG-LABEL: {{^}}store_local_v2i32:
+; FUNC-LABEL: {{^}}store_local_v2i32:
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-; CM-LABEL: {{^}}store_local_v2i32:
+
 ; CM: LDS_WRITE
 ; CM: LDS_WRITE
-; SI-LABEL: {{^}}store_local_v2i32:
+
 ; SI: ds_write_b64
 define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
 entry:
@@ -278,17 +276,17 @@ entry:
   ret void
 }
 
-; EG-LABEL: {{^}}store_local_v4i32:
+; FUNC-LABEL: {{^}}store_local_v4i32:
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-; CM-LABEL: {{^}}store_local_v4i32:
+
 ; CM: LDS_WRITE
 ; CM: LDS_WRITE
 ; CM: LDS_WRITE
 ; CM: LDS_WRITE
-; SI-LABEL: {{^}}store_local_v4i32:
+
 ; SI: ds_write_b32
 ; SI: ds_write_b32
 ; SI: ds_write_b32
@@ -326,19 +324,19 @@ entry:
 ; Evergreen / Northern Islands don't support 64-bit stores yet, so there should
 ; be two 32-bit stores.
 
-; EG-LABEL: {{^}}vecload2:
+; FUNC-LABEL: {{^}}vecload2:
 ; EG: MEM_RAT_CACHELESS STORE_RAW
-; CM-LABEL: {{^}}vecload2:
+
 ; CM: MEM_RAT_CACHELESS STORE_DWORD
-; SI-LABEL: {{^}}vecload2:
+
 ; SI: buffer_store_dwordx2
 define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
 entry:
-  %0 = load i32 addrspace(2)* %mem, align 4
-  %arrayidx1.i = getelementptr inbounds i32 addrspace(2)* %mem, i64 1
-  %1 = load i32 addrspace(2)* %arrayidx1.i, align 4
+  %0 = load i32, i32 addrspace(2)* %mem, align 4
+  %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1
+  %1 = load i32, i32 addrspace(2)* %arrayidx1.i, align 4
   store i32 %0, i32 addrspace(1)* %out, align 4
-  %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %out, i64 1
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
   ret void
 }
@@ -357,16 +355,15 @@ attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"=
 ; CM: STORE_DWORD
 ; CM: STORE_DWORD
 ; CM: STORE_DWORD
-; SI: buffer_store_dwordx2
-; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx4
 define void @i128-const-store(i32 addrspace(1)* %out) {
 entry:
   store i32 1, i32 addrspace(1)* %out, align 4
-  %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %out, i64 1
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 1, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx4 = getelementptr inbounds i32 addrspace(1)* %out, i64 2
+  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
   store i32 2, i32 addrspace(1)* %arrayidx4, align 4
-  %arrayidx6 = getelementptr inbounds i32 addrspace(1)* %out, i64 3
+  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
   store i32 2, i32 addrspace(1)* %arrayidx6, align 4
   ret void
 }
diff --git a/test/CodeGen/R600/store.r600.ll b/test/CodeGen/R600/store.r600.ll
index 21972603cac9..696fb033b5ec 100644
--- a/test/CodeGen/R600/store.r600.ll
+++ b/test/CodeGen/R600/store.r600.ll
@@ -7,7 +7,7 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
 
 define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %1 = load <4 x i32> addrspace(1) * %in
+  %1 = load <4 x i32>, <4 x i32> addrspace(1) * %in
   store <4 x i32> %1, <4 x i32> addrspace(1)* %out
   ret void
 }
@@ -16,7 +16,7 @@ define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %
 ; EG: {{^}}store_v4f32:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
 define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
-  %1 = load <4 x float> addrspace(1) * %in
+  %1 = load <4 x float>, <4 x float> addrspace(1) * %in
   store <4 x float> %1, <4 x float> addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/R600/sub.ll b/test/CodeGen/R600/sub.ll
index be48e186e870..b7fba0efa5b2 100644
--- a/test/CodeGen/R600/sub.ll
+++ b/test/CodeGen/R600/sub.ll
@@ -9,9 +9,9 @@ declare i32 @llvm.r600.read.tidig.x() readnone
 
 ; SI: v_subrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %a = load i32 addrspace(1)* %in
-  %b = load i32 addrspace(1)* %b_ptr
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %a = load i32, i32 addrspace(1)* %in
+  %b = load i32, i32 addrspace(1)* %b_ptr
   %result = sub i32 %a, %b
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -26,9 +26,9 @@ define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
-  %a = load <2 x i32> addrspace(1) * %in
-  %b = load <2 x i32> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
+  %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
   %result = sub <2 x i32> %a, %b
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
@@ -46,9 +46,9 @@ define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)
 ; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
-  %a = load <4 x i32> addrspace(1) * %in
-  %b = load <4 x i32> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
+  %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
   %result = sub <4 x i32> %a, %b
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
@@ -58,11 +58,13 @@ define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)
 ; SI: s_sub_u32
 ; SI: s_subb_u32
 
-; EG-DAG: SETGE_UINT
-; EG-DAG: CNDE_INT
-; EG-DAG: SUB_INT
-; EG-DAG: SUB_INT
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
+; EG-DAG: SUB_INT {{[* ]*}}[[LO]]
+; EG-DAG: SUBB_UINT
 ; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT {{[* ]*}}[[HI]]
+; EG-NOT: SUB
 define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind {
   %result = sub i64 %a, %b
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -73,17 +75,19 @@ define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind
 ; SI: v_sub_i32_e32
 ; SI: v_subb_u32_e32
 
-; EG-DAG: SETGE_UINT
-; EG-DAG: CNDE_INT
-; EG-DAG: SUB_INT
-; EG-DAG: SUB_INT
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
+; EG-DAG: SUB_INT {{[* ]*}}[[LO]]
+; EG-DAG: SUBB_UINT
 ; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT {{[* ]*}}[[HI]]
+; EG-NOT: SUB
 define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() readnone
-  %a_ptr = getelementptr i64 addrspace(1)* %inA, i32 %tid
-  %b_ptr = getelementptr i64 addrspace(1)* %inB, i32 %tid
-  %a = load i64 addrspace(1)* %a_ptr
-  %b = load i64 addrspace(1)* %b_ptr
+  %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
+  %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
+  %a = load i64, i64 addrspace(1)* %a_ptr
+  %b = load i64, i64 addrspace(1)* %b_ptr
   %result = sub i64 %a, %b
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -96,10 +100,10 @@ define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias
 ; SI: v_subb_u32_e32
 define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.r600.read.tidig.x() readnone
-  %a_ptr = getelementptr <2 x i64> addrspace(1)* %inA, i32 %tid
-  %b_ptr = getelementptr <2 x i64> addrspace(1)* %inB, i32 %tid
-  %a = load <2 x i64> addrspace(1)* %a_ptr
-  %b = load <2 x i64> addrspace(1)* %b_ptr
+  %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
+  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
+  %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr
+  %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
   %result = sub <2 x i64> %a, %b
   store <2 x i64> %result, <2 x i64> addrspace(1)* %out
   ret void
@@ -116,10 +120,10 @@ define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(
 ; SI: v_subb_u32_e32
 define void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.r600.read.tidig.x() readnone
-  %a_ptr = getelementptr <4 x i64> addrspace(1)* %inA, i32 %tid
-  %b_ptr = getelementptr <4 x i64> addrspace(1)* %inB, i32 %tid
-  %a = load <4 x i64> addrspace(1)* %a_ptr
-  %b = load <4 x i64> addrspace(1)* %b_ptr
+  %a_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inA, i32 %tid
+  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inB, i32 %tid
+  %a = load <4 x i64>, <4 x i64> addrspace(1)* %a_ptr
+  %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
   %result = sub <4 x i64> %a, %b
   store <4 x i64> %result, <4 x i64> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/subreg-coalescer-crash.ll b/test/CodeGen/R600/subreg-coalescer-crash.ll
index a9eec7908b6c..c4dae4736cfa 100644
--- a/test/CodeGen/R600/subreg-coalescer-crash.ll
+++ b/test/CodeGen/R600/subreg-coalescer-crash.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -o - %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s
-; ModuleID = 'bugpoint-reduced-simplified.bc'
 
+; SI-LABEL:{{^}}row_filter_C1_D0:
 ; SI: s_endpgm
 ; Function Attrs: nounwind
-define void @row_filter_C1_D0() #0 {
+define void @row_filter_C1_D0() {
 entry:
   br i1 undef, label %for.inc.1, label %do.body.preheader
 
@@ -42,3 +42,68 @@ for.inc.1:                                        ; preds = %do.body.1562.prehea
   unreachable
 }
 
+; SI-LABEL: {{^}}foo:
+; SI: s_endpgm
+define void @foo() #0 {
+bb:
+  br i1 undef, label %bb2, label %bb1
+
+bb1:                                              ; preds = %bb
+  br i1 undef, label %bb4, label %bb6
+
+bb2:                                              ; preds = %bb4, %bb
+  %tmp = phi float [ %tmp5, %bb4 ], [ 0.000000e+00, %bb ]
+  br i1 undef, label %bb9, label %bb13
+
+bb4:                                              ; preds = %bb7, %bb6, %bb1
+  %tmp5 = phi float [ undef, %bb1 ], [ undef, %bb6 ], [ %tmp8, %bb7 ]
+  br label %bb2
+
+bb6:                                              ; preds = %bb1
+  br i1 undef, label %bb7, label %bb4
+
+bb7:                                              ; preds = %bb6
+  %tmp8 = fmul float undef, undef
+  br label %bb4
+
+bb9:                                              ; preds = %bb2
+  %tmp10 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 2)
+  %tmp11 = extractelement <4 x float> %tmp10, i32 1
+  %tmp12 = extractelement <4 x float> %tmp10, i32 3
+  br label %bb14
+
+bb13:                                             ; preds = %bb2
+  br i1 undef, label %bb23, label %bb24
+
+bb14:                                             ; preds = %bb27, %bb24, %bb9
+  %tmp15 = phi float [ %tmp12, %bb9 ], [ undef, %bb27 ], [ 0.000000e+00, %bb24 ]
+  %tmp16 = phi float [ %tmp11, %bb9 ], [ undef, %bb27 ], [ %tmp25, %bb24 ]
+  %tmp17 = fmul float 10.5, %tmp16
+  %tmp18 = fmul float 11.5, %tmp15
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp18, float %tmp17, float %tmp17, float %tmp17)
+  ret void
+
+bb23:                                             ; preds = %bb13
+  br i1 undef, label %bb24, label %bb26
+
+bb24:                                             ; preds = %bb26, %bb23, %bb13
+  %tmp25 = phi float [ %tmp, %bb13 ], [ %tmp, %bb26 ], [ 0.000000e+00, %bb23 ]
+  br i1 undef, label %bb27, label %bb14
+
+bb26:                                             ; preds = %bb23
+  br label %bb24
+
+bb27:                                             ; preds = %bb24
+  br label %bb14
+}
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.SI.packf16(float, float) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" "unsafe-fp-math"="true" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/swizzle-export.ll b/test/CodeGen/R600/swizzle-export.ll
index 5eaca7675237..000ee2faa478 100644
--- a/test/CodeGen/R600/swizzle-export.ll
+++ b/test/CodeGen/R600/swizzle-export.ll
@@ -12,56 +12,56 @@ main_body:
   %1 = extractelement <4 x float> %reg1, i32 1
   %2 = extractelement <4 x float> %reg1, i32 2
   %3 = extractelement <4 x float> %reg1, i32 3
-  %4 = load <4 x float> addrspace(8)* null
+  %4 = load <4 x float>, <4 x float> addrspace(8)* null
   %5 = extractelement <4 x float> %4, i32 1
-  %6 = load <4 x float> addrspace(8)* null
+  %6 = load <4 x float>, <4 x float> addrspace(8)* null
   %7 = extractelement <4 x float> %6, i32 2
-  %8 = load <4 x float> addrspace(8)* null
+  %8 = load <4 x float>, <4 x float> addrspace(8)* null
   %9 = extractelement <4 x float> %8, i32 0
   %10 = fmul float 0.000000e+00, %9
-  %11 = load <4 x float> addrspace(8)* null
+  %11 = load <4 x float>, <4 x float> addrspace(8)* null
   %12 = extractelement <4 x float> %11, i32 0
   %13 = fmul float %5, %12
-  %14 = load <4 x float> addrspace(8)* null
+  %14 = load <4 x float>, <4 x float> addrspace(8)* null
   %15 = extractelement <4 x float> %14, i32 0
   %16 = fmul float 0.000000e+00, %15
-  %17 = load <4 x float> addrspace(8)* null
+  %17 = load <4 x float>, <4 x float> addrspace(8)* null
   %18 = extractelement <4 x float> %17, i32 0
   %19 = fmul float 0.000000e+00, %18
-  %20 = load <4 x float> addrspace(8)* null
+  %20 = load <4 x float>, <4 x float> addrspace(8)* null
   %21 = extractelement <4 x float> %20, i32 0
   %22 = fmul float %7, %21
-  %23 = load <4 x float> addrspace(8)* null
+  %23 = load <4 x float>, <4 x float> addrspace(8)* null
   %24 = extractelement <4 x float> %23, i32 0
   %25 = fmul float 0.000000e+00, %24
-  %26 = load <4 x float> addrspace(8)* null
+  %26 = load <4 x float>, <4 x float> addrspace(8)* null
   %27 = extractelement <4 x float> %26, i32 0
   %28 = fmul float 0.000000e+00, %27
-  %29 = load <4 x float> addrspace(8)* null
+  %29 = load <4 x float>, <4 x float> addrspace(8)* null
   %30 = extractelement <4 x float> %29, i32 0
   %31 = fmul float 0.000000e+00, %30
-  %32 = load <4 x float> addrspace(8)* null
+  %32 = load <4 x float>, <4 x float> addrspace(8)* null
   %33 = extractelement <4 x float> %32, i32 0
   %34 = fmul float 0.000000e+00, %33
-  %35 = load <4 x float> addrspace(8)* null
+  %35 = load <4 x float>, <4 x float> addrspace(8)* null
   %36 = extractelement <4 x float> %35, i32 0
   %37 = fmul float 0.000000e+00, %36
-  %38 = load <4 x float> addrspace(8)* null
+  %38 = load <4 x float>, <4 x float> addrspace(8)* null
   %39 = extractelement <4 x float> %38, i32 0
   %40 = fmul float 1.000000e+00, %39
-  %41 = load <4 x float> addrspace(8)* null
+  %41 = load <4 x float>, <4 x float> addrspace(8)* null
   %42 = extractelement <4 x float> %41, i32 0
   %43 = fmul float 0.000000e+00, %42
-  %44 = load <4 x float> addrspace(8)* null
+  %44 = load <4 x float>, <4 x float> addrspace(8)* null
   %45 = extractelement <4 x float> %44, i32 0
   %46 = fmul float 0.000000e+00, %45
-  %47 = load <4 x float> addrspace(8)* null
+  %47 = load <4 x float>, <4 x float> addrspace(8)* null
   %48 = extractelement <4 x float> %47, i32 0
   %49 = fmul float 0.000000e+00, %48
-  %50 = load <4 x float> addrspace(8)* null
+  %50 = load <4 x float>, <4 x float> addrspace(8)* null
   %51 = extractelement <4 x float> %50, i32 0
   %52 = fmul float 0.000000e+00, %51
-  %53 = load <4 x float> addrspace(8)* null
+  %53 = load <4 x float>, <4 x float> addrspace(8)* null
   %54 = extractelement <4 x float> %53, i32 0
   %55 = fmul float 1.000000e+00, %54
   %56 = insertelement <4 x float> undef, float %0, i32 0
@@ -102,12 +102,12 @@ main_body:
   %1 = extractelement <4 x float> %reg1, i32 1
   %2 = fadd float %0, 2.5
   %3 = fmul float %1, 3.5
-  %4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %5 = extractelement <4 x float> %4, i32 0
   %6 = call float @llvm.cos.f32(float %5)
-  %7 = load <4 x float> addrspace(8)* null
+  %7 = load <4 x float>, <4 x float> addrspace(8)* null
   %8 = extractelement <4 x float> %7, i32 0
-  %9 = load <4 x float> addrspace(8)* null
+  %9 = load <4 x float>, <4 x float> addrspace(8)* null
   %10 = extractelement <4 x float> %9, i32 1
   %11 = insertelement <4 x float> undef, float %2, i32 0
   %12 = insertelement <4 x float> %11, float %3, i32 1
diff --git a/test/CodeGen/R600/trunc-cmp-constant.ll b/test/CodeGen/R600/trunc-cmp-constant.ll
index 67a9aaffb6ff..dac74728b3ce 100644
--- a/test/CodeGen/R600/trunc-cmp-constant.ll
+++ b/test/CodeGen/R600/trunc-cmp-constant.ll
@@ -1,15 +1,15 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL {{^}}sextload_i1_to_i32_trunc_cmp_eq_0:
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
-; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[TMP]], 1{{$}}
-; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1{{$}}
+; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
+; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, -1{{$}}
 ; SI: v_cndmask_b32_e64
 ; SI: buffer_store_byte
 define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1 addrspace(1)* %in
+  %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp eq i32 %ext, 0
   store i1 %cmp, i1 addrspace(1)* %out
@@ -20,12 +20,12 @@ define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspa
 ; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_0:
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
-; SI: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], [[TMP]], 1{{$}}
-; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], [[CMP0]], -1
+; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
+; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
 ; SI-NEXT: buffer_store_byte [[RESULT]]
 define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1 addrspace(1)* %in
+  %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp eq i32 %ext, 0
   store i1 %cmp, i1 addrspace(1)* %out
@@ -34,9 +34,9 @@ define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspa
 
 ; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_1:
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
 define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1 addrspace(1)* %in
+  %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp eq i32 %ext, 1
   store i1 %cmp, i1 addrspace(1)* %out
@@ -48,7 +48,7 @@ define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspa
 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
 ; SI-NEXT: buffer_store_byte [[RESULT]]
 define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1 addrspace(1)* %in
+  %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp eq i32 %ext, 1
   store i1 %cmp, i1 addrspace(1)* %out
@@ -60,7 +60,7 @@ define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspa
 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
 ; SI-NEXT: buffer_store_byte [[RESULT]]
 define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1 addrspace(1)* %in
+  %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp eq i32 %ext, -1
   store i1 %cmp, i1 addrspace(1)* %out
@@ -69,9 +69,9 @@ define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addr
 
 ; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_neg1:
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
 define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1 addrspace(1)* %in
+  %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp eq i32 %ext, -1
   store i1 %cmp, i1 addrspace(1)* %out
@@ -84,7 +84,7 @@ define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addr
 ; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
 ; SI-NEXT: buffer_store_byte [[RESULT]]
 define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1 addrspace(1)* %in
+  %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp ne i32 %ext, 0
   store i1 %cmp, i1 addrspace(1)* %out
@@ -96,7 +96,7 @@ define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspa
 ; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
 ; SI-NEXT: buffer_store_byte [[RESULT]]
 define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1 addrspace(1)* %in
+  %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp ne i32 %ext, 0
   store i1 %cmp, i1 addrspace(1)* %out
@@ -105,9 +105,9 @@ define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspa
 
 ; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_1:
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
 define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1 addrspace(1)* %in
+  %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp ne i32 %ext, 1
   store i1 %cmp, i1 addrspace(1)* %out
@@ -117,12 +117,12 @@ define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspa
 ; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_1:
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
-; SI: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], [[TMP]], 1{{$}}
-; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], [[CMP0]], -1
+; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
+; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
 ; SI-NEXT: buffer_store_byte [[RESULT]]
 define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1 addrspace(1)* %in
+  %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp ne i32 %ext, 1
   store i1 %cmp, i1 addrspace(1)* %out
@@ -137,7 +137,7 @@ define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspa
 ; XSI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP0]]
 ; XSI-NEXT: buffer_store_byte [[RESULT]]
 define void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1 addrspace(1)* %in
+  %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp ne i32 %ext, -1
   store i1 %cmp, i1 addrspace(1)* %out
@@ -146,9 +146,9 @@ define void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addr
 
 ; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_neg1:
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
 define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1 addrspace(1)* %in
+  %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp ne i32 %ext, -1
   store i1 %cmp, i1 addrspace(1)* %out
@@ -157,11 +157,11 @@ define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addr
 
 ; FUNC-LABEL: {{^}}masked_load_i1_to_i32_trunc_cmp_ne_neg1:
 ; SI: buffer_load_sbyte [[LOAD:v[0-9]+]]
-; SI: v_cmp_ne_i32_e64 {{s\[[0-9]+:[0-9]+\]}}, [[LOAD]], -1{{$}}
+; SI: v_cmp_ne_i32_e32 vcc, -1, [[LOAD]]{{$}}
 ; SI-NEXT: v_cndmask_b32_e64
 ; SI-NEXT: buffer_store_byte
 define void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
-  %load = load i8 addrspace(1)* %in
+  %load = load i8, i8 addrspace(1)* %in
   %masked = and i8 %load, 255
   %ext = sext i8 %masked to i32
   %cmp = icmp ne i32 %ext, -1
diff --git a/test/CodeGen/R600/trunc.ll b/test/CodeGen/R600/trunc.ll
index bc00db7dbeef..bf690ca4cb28 100644
--- a/test/CodeGen/R600/trunc.ll
+++ b/test/CodeGen/R600/trunc.ll
@@ -36,6 +36,8 @@ define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
 ; SI: s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2
 ; SI: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]],
 ; SI: s_addc_u32
+; SI: v_mov_b32_e32
+; SI: v_mov_b32_e32
 ; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
 ; SI: buffer_store_dword v[[LO_VREG]],
 define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
@@ -51,7 +53,7 @@ define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64
 ; SI: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
 ; SI: v_cmp_eq_i32
 define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) {
-  %a = load i32 addrspace(1)* %ptr, align 4
+  %a = load i32, i32 addrspace(1)* %ptr, align 4
   %trunc = trunc i32 %a to i1
   %result = select i1 %trunc, i32 1, i32 0
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -71,8 +73,8 @@ define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
 ; SI-LABEL: {{^}}s_trunc_i64_to_i1:
 ; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI: v_and_b32_e64 [[MASKED:v[0-9]+]], 1, s[[SLO]]
-; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[MASKED]], 1
-; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, [[CMP]]
+; SI: v_cmp_eq_i32_e32 vcc, 1, [[MASKED]]
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc
 define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) {
   %trunc = trunc i64 %x to i1
   %sel = select i1 %trunc, i32 63, i32 -12
@@ -83,13 +85,13 @@ define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) {
 ; SI-LABEL: {{^}}v_trunc_i64_to_i1:
 ; SI: buffer_load_dwordx2 v{{\[}}[[VLO:[0-9]+]]:{{[0-9]+\]}}
 ; SI: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]]
-; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[MASKED]], 1
-; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, [[CMP]]
+; SI: v_cmp_eq_i32_e32 vcc, 1, [[MASKED]]
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc
 define void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep = getelementptr i64 addrspace(1)* %in, i32 %tid
-  %out.gep = getelementptr i32 addrspace(1)* %out, i32 %tid
-  %x = load i64 addrspace(1)* %gep
+  %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %x = load i64, i64 addrspace(1)* %gep
 
   %trunc = trunc i64 %x to i1
   %sel = select i1 %trunc, i32 63, i32 -12
diff --git a/test/CodeGen/R600/tti-unroll-prefs.ll b/test/CodeGen/R600/tti-unroll-prefs.ll
index 0009c42f79bc..76c32afc1f21 100644
--- a/test/CodeGen/R600/tti-unroll-prefs.ll
+++ b/test/CodeGen/R600/tti-unroll-prefs.ll
@@ -39,7 +39,7 @@ if.then4:                                         ; preds = %if.then4.lr.ph, %if
   %add2 = add nsw i32 %b.addr.014, 1
   %1 = sext i32 %b.addr.014 to i64
   %add.ptr.sum = add nsw i64 %1, %0
-  %add.ptr5 = getelementptr inbounds i8 addrspace(1)* %dst, i64 %add.ptr.sum
+  %add.ptr5 = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %add.ptr.sum
   store i8 0, i8 addrspace(1)* %add.ptr5, align 1
   %inc = add nsw i32 %i.015, 1
   %cmp1 = icmp slt i32 %inc, 4
diff --git a/test/CodeGen/R600/uaddo.ll b/test/CodeGen/R600/uaddo.ll
index 57d7835f99fb..11438f267ad0 100644
--- a/test/CodeGen/R600/uaddo.ll
+++ b/test/CodeGen/R600/uaddo.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
 declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
@@ -9,6 +9,9 @@ declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
 ; SI: add
 ; SI: addc
 ; SI: addc
+
+; EG: ADDC_UINT
+; EG: ADDC_UINT
 define void @uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %uadd, 0
@@ -21,6 +24,9 @@ define void @uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 
 ; FUNC-LABEL: {{^}}s_uaddo_i32:
 ; SI: s_add_i32
+
+; EG: ADDC_UINT
+; EG: ADD_INT
 define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %uadd, 0
@@ -32,9 +38,12 @@ define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
 
 ; FUNC-LABEL: {{^}}v_uaddo_i32:
 ; SI: v_add_i32
+
+; EG: ADDC_UINT
+; EG: ADD_INT
 define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
-  %a = load i32 addrspace(1)* %aptr, align 4
-  %b = load i32 addrspace(1)* %bptr, align 4
+  %a = load i32, i32 addrspace(1)* %aptr, align 4
+  %b = load i32, i32 addrspace(1)* %bptr, align 4
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %uadd, 0
   %carry = extractvalue { i32, i1 } %uadd, 1
@@ -46,6 +55,9 @@ define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
 ; FUNC-LABEL: {{^}}s_uaddo_i64:
 ; SI: s_add_u32
 ; SI: s_addc_u32
+
+; EG: ADDC_UINT
+; EG: ADD_INT
 define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %uadd, 0
@@ -58,9 +70,12 @@ define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64
 ; FUNC-LABEL: {{^}}v_uaddo_i64:
 ; SI: v_add_i32
 ; SI: v_addc_u32
+
+; EG: ADDC_UINT
+; EG: ADD_INT
 define void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
-  %a = load i64 addrspace(1)* %aptr, align 4
-  %b = load i64 addrspace(1)* %bptr, align 4
+  %a = load i64, i64 addrspace(1)* %aptr, align 4
+  %b = load i64, i64 addrspace(1)* %bptr, align 4
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %uadd, 0
   %carry = extractvalue { i64, i1 } %uadd, 1
diff --git a/test/CodeGen/R600/udiv.ll b/test/CodeGen/R600/udiv.ll
index 0c2c65bb7bf6..de22a22e5029 100644
--- a/test/CodeGen/R600/udiv.ll
+++ b/test/CodeGen/R600/udiv.ll
@@ -7,9 +7,9 @@
 ;EG: CF_END
 
 define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %a = load i32 addrspace(1) * %in
-  %b = load i32 addrspace(1) * %b_ptr
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %a = load i32, i32 addrspace(1) * %in
+  %b = load i32, i32 addrspace(1) * %b_ptr
   %result = udiv i32 %a, %b
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -25,9 +25,9 @@ define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ;SI: s_endpgm
 
 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
-  %a = load <2 x i32> addrspace(1) * %in
-  %b = load <2 x i32> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
+  %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
   %result = udiv <2 x i32> %a, %b
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
@@ -39,9 +39,9 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
 ;SI: s_endpgm
 
 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
-  %a = load <4 x i32> addrspace(1) * %in
-  %b = load <4 x i32> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
+  %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
   %result = udiv <4 x i32> %a, %b
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/udivrem.ll b/test/CodeGen/R600/udivrem.ll
index b439d7aa892a..b3837f28209a 100644
--- a/test/CodeGen/R600/udivrem.ll
+++ b/test/CodeGen/R600/udivrem.ll
@@ -118,7 +118,7 @@ define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]]
 ; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: v_sub_i32_e32 [[FIRST_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FIRST_Num_S_Remainder]]
+; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}}
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]]
@@ -141,7 +141,7 @@ define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]]
 ; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: v_sub_i32_e32 [[SECOND_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[SECOND_Num_S_Remainder]]
+; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}}
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]]
@@ -268,7 +268,7 @@ define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i3
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]]
 ; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: v_sub_i32_e32 [[FIRST_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FIRST_Num_S_Remainder]]
+; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[l0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}}
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]]
@@ -291,7 +291,7 @@ define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i3
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]]
 ; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: v_sub_i32_e32 [[SECOND_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[SECOND_Num_S_Remainder]]
+; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}}
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]]
@@ -314,7 +314,7 @@ define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i3
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_mul_hi_u32 [[THIRD_Quotient:v[0-9]+]]
 ; SI-DAG: v_mul_lo_i32 [[THIRD_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: v_sub_i32_e32 [[THIRD_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[THIRD_Num_S_Remainder]]
+; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder:v[0-9]+]], [[THIRD_Num_S_Remainder]], {{v[0-9]+}}
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_and_b32_e32 [[THIRD_Tmp1:v[0-9]+]]
@@ -335,20 +335,6 @@ define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i3
 ; SI-DAG: v_add_i32_e32 [[FOURTH_RCP_A_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]]
 ; SI-DAG: v_subrev_i32_e32 [[FOURTH_RCP_S_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]]
 ; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_mul_hi_u32 [[FOURTH_Quotient:v[0-9]+]]
-; SI-DAG: v_mul_lo_i32 [[FOURTH_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: v_sub_i32_e32 [[FOURTH_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FOURTH_Num_S_Remainder]]
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_and_b32_e32 [[FOURTH_Tmp1:v[0-9]+]]
-; SI-DAG: v_add_i32_e32 [[FOURTH_Quotient_A_One:v[0-9]+]], {{.*}}, [[FOURTH_Quotient]]
-; SI-DAG: v_subrev_i32_e32 [[FOURTH_Quotient_S_One:v[0-9]+]],
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_add_i32_e32 [[FOURTH_Remainder_A_Den:v[0-9]+]],
-; SI-DAG: v_subrev_i32_e32 [[FOURTH_Remainder_S_Den:v[0-9]+]],
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
 ; SI: s_endpgm
 define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
   %result0 = udiv <4 x i32> %x, %y
diff --git a/test/CodeGen/R600/udivrem24.ll b/test/CodeGen/R600/udivrem24.ll
index 4b98ac67b220..4de881b66f10 100644
--- a/test/CodeGen/R600/udivrem24.ll
+++ b/test/CodeGen/R600/udivrem24.ll
@@ -13,9 +13,9 @@
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
 define void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
-  %den_ptr = getelementptr i8 addrspace(1)* %in, i8 1
-  %num = load i8 addrspace(1) * %in
-  %den = load i8 addrspace(1) * %den_ptr
+  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
+  %num = load i8, i8 addrspace(1) * %in
+  %den = load i8, i8 addrspace(1) * %den_ptr
   %result = udiv i8 %num, %den
   store i8 %result, i8 addrspace(1)* %out
   ret void
@@ -32,9 +32,9 @@ define void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
 define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
-  %den_ptr = getelementptr i16 addrspace(1)* %in, i16 1
-  %num = load i16 addrspace(1) * %in, align 2
-  %den = load i16 addrspace(1) * %den_ptr, align 2
+  %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
+  %num = load i16, i16 addrspace(1) * %in, align 2
+  %den = load i16, i16 addrspace(1) * %den_ptr, align 2
   %result = udiv i16 %num, %den
   store i16 %result, i16 addrspace(1)* %out, align 2
   ret void
@@ -51,9 +51,9 @@ define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
 define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %num = load i32 addrspace(1) * %in, align 4
-  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 8
   %den.i24.0 = shl i32 %den, 8
   %num.i24 = lshr i32 %num.i24.0, 8
@@ -71,9 +71,9 @@ define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
 define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %num = load i32 addrspace(1) * %in, align 4
-  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 7
   %den.i24.0 = shl i32 %den, 7
   %num.i24 = lshr i32 %num.i24.0, 7
@@ -91,9 +91,9 @@ define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
 define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %num = load i32 addrspace(1) * %in, align 4
-  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 8
   %den.i24.0 = shl i32 %den, 7
   %num.i24 = lshr i32 %num.i24.0, 8
@@ -111,9 +111,9 @@ define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
 define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %num = load i32 addrspace(1) * %in, align 4
-  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 7
   %den.i24.0 = shl i32 %den, 8
   %num.i24 = lshr i32 %num.i24.0, 7
@@ -134,9 +134,9 @@ define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
 define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
-  %den_ptr = getelementptr i8 addrspace(1)* %in, i8 1
-  %num = load i8 addrspace(1) * %in
-  %den = load i8 addrspace(1) * %den_ptr
+  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
+  %num = load i8, i8 addrspace(1) * %in
+  %den = load i8, i8 addrspace(1) * %den_ptr
   %result = urem i8 %num, %den
   store i8 %result, i8 addrspace(1)* %out
   ret void
@@ -153,9 +153,9 @@ define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
 define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
-  %den_ptr = getelementptr i16 addrspace(1)* %in, i16 1
-  %num = load i16 addrspace(1) * %in, align 2
-  %den = load i16 addrspace(1) * %den_ptr, align 2
+  %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
+  %num = load i16, i16 addrspace(1) * %in, align 2
+  %den = load i16, i16 addrspace(1) * %den_ptr, align 2
   %result = urem i16 %num, %den
   store i16 %result, i16 addrspace(1)* %out, align 2
   ret void
@@ -172,9 +172,9 @@ define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
 define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %num = load i32 addrspace(1) * %in, align 4
-  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 8
   %den.i24.0 = shl i32 %den, 8
   %num.i24 = lshr i32 %num.i24.0, 8
@@ -192,9 +192,9 @@ define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
 define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %num = load i32 addrspace(1) * %in, align 4
-  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 7
   %den.i24.0 = shl i32 %den, 7
   %num.i24 = lshr i32 %num.i24.0, 7
@@ -212,9 +212,9 @@ define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
 define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %num = load i32 addrspace(1) * %in, align 4
-  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 8
   %den.i24.0 = shl i32 %den, 7
   %num.i24 = lshr i32 %num.i24.0, 8
@@ -232,9 +232,9 @@ define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
 define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %num = load i32 addrspace(1) * %in, align 4
-  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in, align 4
+  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 7
   %den.i24.0 = shl i32 %den, 8
   %num.i24 = lshr i32 %num.i24.0, 7
diff --git a/test/CodeGen/R600/udivrem64.ll b/test/CodeGen/R600/udivrem64.ll
index 77922fe8dab6..9f3069bdf80c 100644
--- a/test/CodeGen/R600/udivrem64.ll
+++ b/test/CodeGen/R600/udivrem64.ll
@@ -1,5 +1,5 @@
-;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
+;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s
 ;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
 
 ;FUNC-LABEL: {{^}}test_udiv:
@@ -35,7 +35,41 @@
 ;EG: BFE_UINT
 ;EG: BFE_UINT
 ;EG: BFE_UINT
-;SI: s_endpgm
+
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
 define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = udiv i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
@@ -75,9 +109,115 @@ define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ;EG: BFE_UINT
 ;EG: BFE_UINT
 ;EG: AND_INT {{.*}}, 1,
-;SI: s_endpgm
+
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
 define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = urem i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
+
+;FUNC-LABEL: {{^}}test_udiv3264:
+;EG: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;GCN-NOT: s_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %1 = lshr i64 %x, 33
+  %2 = lshr i64 %y, 33
+  %result = udiv i64 %1, %2
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+;FUNC-LABEL: {{^}}test_urem3264:
+;EG: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;GCN-NOT: s_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %1 = lshr i64 %x, 33
+  %2 = lshr i64 %y, 33
+  %result = urem i64 %1, %2
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+;FUNC-LABEL: {{^}}test_udiv2464:
+;EG: UINT_TO_FLT
+;EG: UINT_TO_FLT
+;EG: FLT_TO_UINT
+;EG-NOT: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: v_mad_f32
+;GCN: s_endpgm
+define void @test_udiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %1 = lshr i64 %x, 40
+  %2 = lshr i64 %y, 40
+  %result = udiv i64 %1, %2
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+;FUNC-LABEL: {{^}}test_urem2464:
+;EG: UINT_TO_FLT
+;EG: UINT_TO_FLT
+;EG: FLT_TO_UINT
+;EG-NOT: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: v_mad_f32
+;GCN: s_endpgm
+define void @test_urem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+  %1 = lshr i64 %x, 40
+  %2 = lshr i64 %y, 40
+  %result = urem i64 %1, %2
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/uint_to_fp.f64.ll b/test/CodeGen/R600/uint_to_fp.f64.ll
index 09e987dd14da..dfec8eb15cb7 100644
--- a/test/CodeGen/R600/uint_to_fp.f64.ll
+++ b/test/CodeGen/R600/uint_to_fp.f64.ll
@@ -11,8 +11,8 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 ; SI: buffer_store_dwordx2 [[RESULT]]
 define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep = getelementptr i64 addrspace(1)* %in, i32 %tid
-  %val = load i64 addrspace(1)* %gep, align 8
+  %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %val = load i64, i64 addrspace(1)* %gep, align 8
   %result = uitofp i64 %val to double
   store double %result, double addrspace(1)* %out
   ret void
@@ -70,12 +70,13 @@ define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i
   ret void
 }
 
+; FIXME: select on 0, 0
 ; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:
 ; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
 ; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
 ; uses an SGPR for [[CMP]]
 ; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]]
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
diff --git a/test/CodeGen/R600/uint_to_fp.ll b/test/CodeGen/R600/uint_to_fp.ll
index cf14c25759f7..00fea80b1bc8 100644
--- a/test/CodeGen/R600/uint_to_fp.ll
+++ b/test/CodeGen/R600/uint_to_fp.ll
@@ -38,7 +38,7 @@ define void @uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32>
 ; SI: v_cvt_f32_u32_e32
 ; SI: s_endpgm
 define void @uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %value = load <4 x i32> addrspace(1) * %in
+  %value = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %result = uitofp <4 x i32> %value to <4 x float>
   store <4 x float> %result, <4 x float> addrspace(1)* %out
   ret void
@@ -50,7 +50,7 @@ define void @uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32>
 ; R600: MULADD_IEEE
 ; SI: v_cvt_f32_u32_e32
 ; SI: v_cvt_f32_u32_e32
-; SI: v_mad_f32
+; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x4f800000
 ; SI: s_endpgm
 define void @uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) {
 entry:
diff --git a/test/CodeGen/R600/unaligned-load-store.ll b/test/CodeGen/R600/unaligned-load-store.ll
index 665dc37c200a..82d88ebd3ae7 100644
--- a/test/CodeGen/R600/unaligned-load-store.ll
+++ b/test/CodeGen/R600/unaligned-load-store.ll
@@ -8,7 +8,7 @@
 ; SI: ds_write_b8
 ; SI: s_endpgm
 define void @unaligned_load_store_i16_local(i16 addrspace(3)* %p, i16 addrspace(3)* %r) nounwind {
-  %v = load i16 addrspace(3)* %p, align 1
+  %v = load i16, i16 addrspace(3)* %p, align 1
   store i16 %v, i16 addrspace(3)* %r, align 1
   ret void
 }
@@ -20,7 +20,7 @@ define void @unaligned_load_store_i16_local(i16 addrspace(3)* %p, i16 addrspace(
 ; SI: buffer_store_byte
 ; SI: s_endpgm
 define void @unaligned_load_store_i16_global(i16 addrspace(1)* %p, i16 addrspace(1)* %r) nounwind {
-  %v = load i16 addrspace(1)* %p, align 1
+  %v = load i16, i16 addrspace(1)* %p, align 1
   store i16 %v, i16 addrspace(1)* %r, align 1
   ret void
 }
@@ -36,7 +36,7 @@ define void @unaligned_load_store_i16_global(i16 addrspace(1)* %p, i16 addrspace
 ; SI: ds_write_b8
 ; SI: s_endpgm
 define void @unaligned_load_store_i32_local(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind {
-  %v = load i32 addrspace(3)* %p, align 1
+  %v = load i32, i32 addrspace(3)* %p, align 1
   store i32 %v, i32 addrspace(3)* %r, align 1
   ret void
 }
@@ -51,7 +51,7 @@ define void @unaligned_load_store_i32_local(i32 addrspace(3)* %p, i32 addrspace(
 ; SI: buffer_store_byte
 ; SI: buffer_store_byte
 define void @unaligned_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace(1)* %r) nounwind {
-  %v = load i32 addrspace(1)* %p, align 1
+  %v = load i32, i32 addrspace(1)* %p, align 1
   store i32 %v, i32 addrspace(1)* %r, align 1
   ret void
 }
@@ -75,7 +75,7 @@ define void @unaligned_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace
 ; SI: ds_write_b8
 ; SI: s_endpgm
 define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(3)* %r) {
-  %v = load i64 addrspace(3)* %p, align 1
+  %v = load i64, i64 addrspace(3)* %p, align 1
   store i64 %v, i64 addrspace(3)* %r, align 1
   ret void
 }
@@ -98,7 +98,7 @@ define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(
 ; SI: buffer_store_byte
 ; SI: buffer_store_byte
 define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) {
-  %v = load i64 addrspace(1)* %p, align 1
+  %v = load i64, i64 addrspace(1)* %p, align 1
   store i64 %v, i64 addrspace(1)* %r, align 1
   ret void
 }
@@ -145,7 +145,7 @@ define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace
 ; SI: ds_write_b8
 ; SI: s_endpgm
 define void @unaligned_load_store_v4i32_local(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind {
-  %v = load <4 x i32> addrspace(3)* %p, align 1
+  %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1
   store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
   ret void
 }
@@ -169,7 +169,7 @@ define void @unaligned_load_store_v4i32_local(<4 x i32> addrspace(3)* %p, <4 x i
 ; FIXME-SI: buffer_load_ubyte
 ; FIXME-SI: buffer_load_ubyte
 define void @unaligned_load_store_v4i32_global(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) nounwind {
-  %v = load <4 x i32> addrspace(1)* %p, align 1
+  %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
   store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
   ret void
 }
@@ -178,7 +178,7 @@ define void @unaligned_load_store_v4i32_global(<4 x i32> addrspace(1)* %p, <4 x
 ; SI: ds_read2_b32
 ; SI: s_endpgm
 define void @load_lds_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
-  %val = load i64 addrspace(3)* %in, align 4
+  %val = load i64, i64 addrspace(3)* %in, align 4
   store i64 %val, i64 addrspace(1)* %out, align 8
   ret void
 }
@@ -187,21 +187,21 @@ define void @load_lds_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspac
 ; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
 ; SI: s_endpgm
 define void @load_lds_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
-  %ptr = getelementptr i64 addrspace(3)* %in, i32 4
-  %val = load i64 addrspace(3)* %ptr, align 4
+  %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4
+  %val = load i64, i64 addrspace(3)* %ptr, align 4
   store i64 %val, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; SI-LABEL: {{^}}load_lds_i64_align_4_with_split_offset:
 ; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits
-; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:0 offset1:1
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
 ; SI: s_endpgm
 define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
   %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
-  %ptr255 = getelementptr i32 addrspace(3)* %ptr, i32 255
+  %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
   %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
-  %val = load i64 addrspace(3)* %ptri64, align 4
+  %val = load i64, i64 addrspace(3)* %ptri64, align 4
   store i64 %val, i64 addrspace(1)* %out, align 8
   ret void
 }
@@ -219,7 +219,7 @@ define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture
 ; SI: s_endpgm
 
 define void @load_lds_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
-  %val = load i64 addrspace(3)* %in, align 1
+  %val = load i64, i64 addrspace(3)* %in, align 1
   store i64 %val, i64 addrspace(1)* %out, align 8
   ret void
 }
@@ -236,18 +236,18 @@ define void @store_lds_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
 ; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
 ; SI: s_endpgm
 define void @store_lds_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
-  %ptr = getelementptr i64 addrspace(3)* %out, i32 4
+  %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4
   store i64 0, i64 addrspace(3)* %ptr, align 4
   ret void
 }
 
 ; SI-LABEL: {{^}}store_lds_i64_align_4_with_split_offset:
 ; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits
-; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; SI: s_endpgm
 define void @store_lds_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
   %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
-  %ptr255 = getelementptr i32 addrspace(3)* %ptr, i32 255
+  %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
   %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
   store i64 0, i64 addrspace(3)* %out, align 4
   ret void
diff --git a/test/CodeGen/R600/unhandled-loop-condition-assertion.ll b/test/CodeGen/R600/unhandled-loop-condition-assertion.ll
index c615f0b84913..036a7e91b47f 100644
--- a/test/CodeGen/R600/unhandled-loop-condition-assertion.ll
+++ b/test/CodeGen/R600/unhandled-loop-condition-assertion.ll
@@ -20,20 +20,20 @@ for.body.lr.ph:                                   ; preds = %entry
 for.body:                                         ; preds = %for.body, %for.body.lr.ph
   %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
   %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)*
-  %1 = load i32 addrspace(1)* %0, align 4
-  %add.ptr = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %main_stride
+  %1 = load i32, i32 addrspace(1)* %0, align 4
+  %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride
   %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
-  %3 = load i32 addrspace(1)* %2, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum
+  %3 = load i32, i32 addrspace(1)* %2, align 4
+  %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum
   %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
-  %5 = load i32 addrspace(1)* %4, align 4
-  %add.ptr2 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum
+  %5 = load i32, i32 addrspace(1)* %4, align 4
+  %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum
   %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)*
-  %7 = load i32 addrspace(1)* %6, align 4
-  %add.ptr3 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum
+  %7 = load i32, i32 addrspace(1)* %6, align 4
+  %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum
   %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)*
-  %9 = load i32 addrspace(1)* %8, align 4
-  %add.ptr6 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 undef
+  %9 = load i32, i32 addrspace(1)* %8, align 4
+  %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef
   br i1 undef, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.body, %entry
@@ -56,20 +56,20 @@ for.body.lr.ph:                                   ; preds = %entry
 for.body:                                         ; preds = %for.body, %for.body.lr.ph
   %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
   %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)*
-  %1 = load i32 addrspace(1)* %0, align 4
-  %add.ptr = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %main_stride
+  %1 = load i32, i32 addrspace(1)* %0, align 4
+  %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride
   %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
-  %3 = load i32 addrspace(1)* %2, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum
+  %3 = load i32, i32 addrspace(1)* %2, align 4
+  %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum
   %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
-  %5 = load i32 addrspace(1)* %4, align 4
-  %add.ptr2 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum
+  %5 = load i32, i32 addrspace(1)* %4, align 4
+  %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum
   %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)*
-  %7 = load i32 addrspace(1)* %6, align 4
-  %add.ptr3 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum
+  %7 = load i32, i32 addrspace(1)* %6, align 4
+  %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum
   %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)*
-  %9 = load i32 addrspace(1)* %8, align 4
-  %add.ptr6 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 undef
+  %9 = load i32, i32 addrspace(1)* %8, align 4
+  %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef
   br i1 undef, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.body, %entry
@@ -92,20 +92,20 @@ for.body.lr.ph:                                   ; preds = %entry
 for.body:                                         ; preds = %for.body, %for.body.lr.ph
   %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
   %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)*
-  %1 = load i32 addrspace(1)* %0, align 4
-  %add.ptr = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %main_stride
+  %1 = load i32, i32 addrspace(1)* %0, align 4
+  %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride
   %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
-  %3 = load i32 addrspace(1)* %2, align 4
-  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum
+  %3 = load i32, i32 addrspace(1)* %2, align 4
+  %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum
   %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
-  %5 = load i32 addrspace(1)* %4, align 4
-  %add.ptr2 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum
+  %5 = load i32, i32 addrspace(1)* %4, align 4
+  %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum
   %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)*
-  %7 = load i32 addrspace(1)* %6, align 4
-  %add.ptr3 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum
+  %7 = load i32, i32 addrspace(1)* %6, align 4
+  %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum
   %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)*
-  %9 = load i32 addrspace(1)* %8, align 4
-  %add.ptr6 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 undef
+  %9 = load i32, i32 addrspace(1)* %8, align 4
+  %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef
   br i1 undef, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.body, %entry
diff --git a/test/CodeGen/R600/unroll.ll b/test/CodeGen/R600/unroll.ll
index e0035eae71cf..ca8d822ec7ed 100644
--- a/test/CodeGen/R600/unroll.ll
+++ b/test/CodeGen/R600/unroll.ll
@@ -20,7 +20,7 @@ loop.header:
   br label %loop.body
 
 loop.body:
-  %ptr = getelementptr [32 x i32]* %0, i32 0, i32 %counter
+  %ptr = getelementptr [32 x i32], [32 x i32]* %0, i32 0, i32 %counter
   store i32 %counter, i32* %ptr
   br label %loop.inc
 
@@ -30,8 +30,8 @@ loop.inc:
   br i1 %1, label  %exit, label %loop.header
 
 exit:
-  %2 = getelementptr [32 x i32]* %0, i32 0, i32 5
-  %3 = load i32* %2
+  %2 = getelementptr [32 x i32], [32 x i32]* %0, i32 0, i32 5
+  %3 = load i32, i32* %2
   store i32 %3, i32 addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/R600/urem.ll b/test/CodeGen/R600/urem.ll
index dce517fcd823..62841ec2d6c5 100644
--- a/test/CodeGen/R600/urem.ll
+++ b/test/CodeGen/R600/urem.ll
@@ -10,21 +10,36 @@
 ; SI: s_endpgm
 ; EG: CF_END
 define void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
-  %a = load i32 addrspace(1)* %in
-  %b = load i32 addrspace(1)* %b_ptr
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %a = load i32, i32 addrspace(1)* %in
+  %b = load i32, i32 addrspace(1)* %b_ptr
   %result = urem i32 %a, %b
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
 
+; FUNC-LABEL: {{^}}test_urem_i32_7:
+; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x24924925
+; SI: v_mul_hi_u32 {{v[0-9]+}}, [[MAGIC]]
+; SI: v_subrev_i32
+; SI: v_mul_lo_i32
+; SI: v_sub_i32
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %num = load i32, i32 addrspace(1) * %in
+  %result = urem i32 %num, 7
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}test_urem_v2i32:
 ; SI: s_endpgm
 ; EG: CF_END
 define void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
-  %a = load <2 x i32> addrspace(1)* %in
-  %b = load <2 x i32> addrspace(1)* %b_ptr
+  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
+  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
   %result = urem <2 x i32> %a, %b
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
@@ -34,9 +49,9 @@ define void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1
 ; SI: s_endpgm
 ; EG: CF_END
 define void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
-  %a = load <4 x i32> addrspace(1)* %in
-  %b = load <4 x i32> addrspace(1)* %b_ptr
+  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
+  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
   %result = urem <4 x i32> %a, %b
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
@@ -46,9 +61,9 @@ define void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1
 ; SI: s_endpgm
 ; EG: CF_END
 define void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
-  %b_ptr = getelementptr i64 addrspace(1)* %in, i64 1
-  %a = load i64 addrspace(1)* %in
-  %b = load i64 addrspace(1)* %b_ptr
+  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
+  %a = load i64, i64 addrspace(1)* %in
+  %b = load i64, i64 addrspace(1)* %b_ptr
   %result = urem i64 %a, %b
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -58,9 +73,9 @@ define void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ; SI: s_endpgm
 ; EG: CF_END
 define void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
-  %b_ptr = getelementptr <2 x i64> addrspace(1)* %in, i64 1
-  %a = load <2 x i64> addrspace(1)* %in
-  %b = load <2 x i64> addrspace(1)* %b_ptr
+  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
+  %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
+  %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
   %result = urem <2 x i64> %a, %b
   store <2 x i64> %result, <2 x i64> addrspace(1)* %out
   ret void
@@ -70,9 +85,9 @@ define void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1
 ; SI: s_endpgm
 ; EG: CF_END
 define void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x i64> addrspace(1)* %in, i64 1
-  %a = load <4 x i64> addrspace(1)* %in
-  %b = load <4 x i64> addrspace(1)* %b_ptr
+  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
+  %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
+  %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
   %result = urem <4 x i64> %a, %b
   store <4 x i64> %result, <4 x i64> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/use-sgpr-multiple-times.ll b/test/CodeGen/R600/use-sgpr-multiple-times.ll
index 97d73ba74bc5..f26f30022b4f 100644
--- a/test/CodeGen/R600/use-sgpr-multiple-times.ll
+++ b/test/CodeGen/R600/use-sgpr-multiple-times.ll
@@ -1,80 +1,87 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
 
 declare float @llvm.fma.f32(float, float, float) #1
 declare float @llvm.fmuladd.f32(float, float, float) #1
 declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) #1
 
 
-; SI-LABEL: {{^}}test_sgpr_use_twice_binop:
-; SI: s_load_dword [[SGPR:s[0-9]+]],
-; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]]
-; SI: buffer_store_dword [[RESULT]]
+; GCN-LABEL: {{^}}test_sgpr_use_twice_binop:
+; GCN: s_load_dword [[SGPR:s[0-9]+]],
+; GCN: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]]
+; GCN: buffer_store_dword [[RESULT]]
 define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 {
   %dbl = fadd float %a, %a
   store float %dbl, float addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: {{^}}test_sgpr_use_three_ternary_op:
-; SI: s_load_dword [[SGPR:s[0-9]+]],
-; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]]
-; SI: buffer_store_dword [[RESULT]]
+; GCN-LABEL: {{^}}test_sgpr_use_three_ternary_op:
+; GCN: s_load_dword [[SGPR:s[0-9]+]],
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]]
+; GCN: buffer_store_dword [[RESULT]]
 define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float %a) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b:
+; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b:
 ; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
-; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]]
-; SI: buffer_store_dword [[RESULT]]
+; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]]
+; GCN: buffer_store_dword [[RESULT]]
 define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a:
+; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a:
 ; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
-; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
-; SI: buffer_store_dword [[RESULT]]
+; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
+; GCN: buffer_store_dword [[RESULT]]
 define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a:
+; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a:
 ; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
-; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
-; SI: buffer_store_dword [[RESULT]]
+; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
+; GCN: buffer_store_dword [[RESULT]]
 define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_imm:
-; SI: s_load_dword [[SGPR:s[0-9]+]]
-; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0
-; SI: buffer_store_dword [[RESULT]]
+; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_imm:
+; GCN: s_load_dword [[SGPR:s[0-9]+]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0
+; GCN: buffer_store_dword [[RESULT]]
 define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float 2.0) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_imm_a:
-; SI: s_load_dword [[SGPR:s[0-9]+]]
-; SI: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
-; SI: buffer_store_dword [[RESULT]]
+; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_imm_a:
+; GCN: s_load_dword [[SGPR:s[0-9]+]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
+; GCN: buffer_store_dword [[RESULT]]
 define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1
   store float %fma, float addrspace(1)* %out, align 4
@@ -82,10 +89,10 @@ define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, fl
 }
 
 ; Don't use fma since fma c, x, y is canonicalized to fma x, c, y
-; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_imm_a_a:
-; SI: s_load_dword [[SGPR:s[0-9]+]]
-; SI: v_mad_i32_i24 [[RESULT:v[0-9]+]], 2, [[SGPR]], [[SGPR]]
-; SI: buffer_store_dword [[RESULT]]
+; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_imm_a_a:
+; GCN: s_load_dword [[SGPR:s[0-9]+]]
+; GCN: v_mad_i32_i24 [[RESULT:v[0-9]+]], 2, [[SGPR]], [[SGPR]]
+; GCN: buffer_store_dword [[RESULT]]
 define void @test_sgpr_use_twice_ternary_op_imm_a_a(i32 addrspace(1)* %out, i32 %a) #0 {
   %fma = call i32 @llvm.AMDGPU.imad24(i32 2, i32 %a, i32 %a) #1
   store i32 %fma, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/R600/usubo.ll b/test/CodeGen/R600/usubo.ll
index be1e66673bc9..3c9b1622a076 100644
--- a/test/CodeGen/R600/usubo.ll
+++ b/test/CodeGen/R600/usubo.ll
@@ -1,11 +1,14 @@
 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone
 declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone
 
 ; FUNC-LABEL: {{^}}usubo_i64_zext:
+
+; EG: SUBB_UINT
+; EG: ADDC_UINT
 define void @usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %usub, 0
@@ -18,6 +21,9 @@ define void @usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 
 ; FUNC-LABEL: {{^}}s_usubo_i32:
 ; SI: s_sub_i32
+
+; EG-DAG: SUBB_UINT
+; EG-DAG: SUB_INT
 define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %usub, 0
@@ -29,9 +35,12 @@ define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
 
 ; FUNC-LABEL: {{^}}v_usubo_i32:
 ; SI: v_subrev_i32_e32
+
+; EG-DAG: SUBB_UINT
+; EG-DAG: SUB_INT
 define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
-  %a = load i32 addrspace(1)* %aptr, align 4
-  %b = load i32 addrspace(1)* %bptr, align 4
+  %a = load i32, i32 addrspace(1)* %aptr, align 4
+  %b = load i32, i32 addrspace(1)* %bptr, align 4
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %usub, 0
   %carry = extractvalue { i32, i1 } %usub, 1
@@ -43,6 +52,11 @@ define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
 ; FUNC-LABEL: {{^}}s_usubo_i64:
 ; SI: s_sub_u32
 ; SI: s_subb_u32
+
+; EG-DAG: SUBB_UINT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG: SUB_INT
 define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %usub, 0
@@ -55,9 +69,14 @@ define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64
 ; FUNC-LABEL: {{^}}v_usubo_i64:
 ; SI: v_sub_i32
 ; SI: v_subb_u32
+
+; EG-DAG: SUBB_UINT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG: SUB_INT
 define void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
-  %a = load i64 addrspace(1)* %aptr, align 4
-  %b = load i64 addrspace(1)* %bptr, align 4
+  %a = load i64, i64 addrspace(1)* %aptr, align 4
+  %b = load i64, i64 addrspace(1)* %bptr, align 4
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %usub, 0
   %carry = extractvalue { i64, i1 } %usub, 1
diff --git a/test/CodeGen/R600/v_cndmask.ll b/test/CodeGen/R600/v_cndmask.ll
index 85936ecda774..c368c5aaf7dc 100644
--- a/test/CodeGen/R600/v_cndmask.ll
+++ b/test/CodeGen/R600/v_cndmask.ll
@@ -10,8 +10,8 @@ declare i32 @llvm.r600.read.tidig.x() #1
 ; SI: s_endpgm
 define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
   %idx = call i32 @llvm.r600.read.tidig.x() #1
-  %f.gep = getelementptr float addrspace(1)* %fptr, i32 %idx
-  %f = load float addrspace(1)* %fptr
+  %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx
+  %f = load float, float addrspace(1)* %fptr
   %setcc = icmp ne i32 %c, 0
   %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
   store float %select, float addrspace(1)* %out
diff --git a/test/CodeGen/R600/valu-i1.ll b/test/CodeGen/R600/valu-i1.ll
index a4027178431b..7d0ebd139f51 100644
--- a/test/CodeGen/R600/valu-i1.ll
+++ b/test/CodeGen/R600/valu-i1.ll
@@ -15,18 +15,18 @@ entry:
   ]
 
 case0:
-  %arrayidx1 = getelementptr i32 addrspace(1)* %dst, i32 %b
+  %arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
   store i32 0, i32 addrspace(1)* %arrayidx1, align 4
   br label %end
 
 case1:
-  %arrayidx5 = getelementptr i32 addrspace(1)* %dst, i32 %b
+  %arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
   store i32 1, i32 addrspace(1)* %arrayidx5, align 4
   br label %end
 
 default:
   %cmp8 = icmp eq i32 %a, 2
-  %arrayidx10 = getelementptr i32 addrspace(1)* %dst, i32 %b
+  %arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
   br i1 %cmp8, label %if, label %else
 
 if:
@@ -42,8 +42,8 @@ end:
 }
 
 ; SI-LABEL: @simple_test_v_if
-; SI: v_cmp_ne_i32_e64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0
-; SI: s_and_saveexec_b64 [[BR_SREG]], [[BR_SREG]]
+; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}}
+; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
 
 ; SI: ; BB#1
@@ -59,7 +59,7 @@ define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1
   br i1 %is.0, label %store, label %exit
 
 store:
-  %gep = getelementptr i32 addrspace(1)* %dst, i32 %tid
+  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
   store i32 999, i32 addrspace(1)* %gep
   ret void
 
@@ -68,8 +68,8 @@ exit:
 }
 
 ; SI-LABEL: @simple_test_v_loop
-; SI: v_cmp_ne_i32_e64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0
-; SI: s_and_saveexec_b64 [[BR_SREG]], [[BR_SREG]]
+; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}}
+; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
 ; SI: s_cbranch_execz BB2_2
 
@@ -81,7 +81,6 @@ exit:
 ; SI: buffer_store_dword
 ; SI: v_cmp_eq_i32_e32 vcc,
 ; SI: s_or_b64 [[OR_SREG:s\[[0-9]+:[0-9]+\]]]
-; SI: v_add_i32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
 ; SI: s_andn2_b64 exec, exec, [[OR_SREG]]
 ; SI: s_cbranch_execnz BB2_3
 
@@ -94,9 +93,9 @@ entry:
 
 loop:
   %i = phi i32 [%tid, %entry], [%i.inc, %loop]
-  %gep.src = getelementptr i32 addrspace(1)* %src, i32 %i
-  %gep.dst = getelementptr i32 addrspace(1)* %dst, i32 %i
-  %load = load i32 addrspace(1)* %src
+  %gep.src = getelementptr i32, i32 addrspace(1)* %src, i32 %i
+  %gep.dst = getelementptr i32, i32 addrspace(1)* %dst, i32 %i
+  %load = load i32, i32 addrspace(1)* %src
   store i32 %load, i32 addrspace(1)* %gep.dst
   %i.inc = add nsw i32 %i, 1
   %cmp = icmp eq i32 %limit, %i.inc
@@ -112,8 +111,8 @@ exit:
 ; Branch to exit if uniformly not taken
 ; SI: ; BB#0:
 ; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
-; SI: v_cmp_gt_i32_e64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]]
-; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG]], [[OUTER_CMP_SREG]]
+; SI: v_cmp_lt_i32_e32 vcc
+; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]]
 ; SI: s_cbranch_execz BB3_2
 
@@ -124,10 +123,10 @@ exit:
 
 ; Clear exec bits for workitems that load -1s
 ; SI: BB3_3:
-; SI: buffer_load_dword [[A:v[0-9]+]]
 ; SI: buffer_load_dword [[B:v[0-9]+]]
-; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], [[A]], -1
-; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_1:s\[[0-9]+:[0-9]+\]]], [[B]], -1
+; SI: buffer_load_dword [[A:v[0-9]+]]
+; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]]
+; SI-DAG: v_cmp_ne_i32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]]
 ; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
 ; SI: s_and_saveexec_b64 [[ORNEG1]], [[ORNEG1]]
 ; SI: s_xor_b64 [[ORNEG1]], exec, [[ORNEG1]]
@@ -155,8 +154,8 @@ define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addr
 bb:
   %tmp = tail call i32 @llvm.r600.read.tidig.x() #0
   %tmp4 = sext i32 %tmp to i64
-  %tmp5 = getelementptr inbounds i32 addrspace(1)* %arg3, i64 %tmp4
-  %tmp6 = load i32 addrspace(1)* %tmp5, align 4
+  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg3, i64 %tmp4
+  %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
   %tmp7 = icmp sgt i32 %tmp6, 0
   %tmp8 = sext i32 %tmp6 to i64
   br i1 %tmp7, label %bb10, label %bb26
@@ -164,10 +163,10 @@ bb:
 bb10:                                             ; preds = %bb, %bb20
   %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ]
   %tmp12 = add nsw i64 %tmp11, %tmp4
-  %tmp13 = getelementptr inbounds i32 addrspace(1)* %arg1, i64 %tmp12
-  %tmp14 = load i32 addrspace(1)* %tmp13, align 4
-  %tmp15 = getelementptr inbounds i32 addrspace(1)* %arg2, i64 %tmp12
-  %tmp16 = load i32 addrspace(1)* %tmp15, align 4
+  %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp12
+  %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4
+  %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp12
+  %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4
   %tmp17 = icmp ne i32 %tmp14, -1
   %tmp18 = icmp ne i32 %tmp16, -1
   %tmp19 = and i1 %tmp17, %tmp18
@@ -175,7 +174,7 @@ bb10:                                             ; preds = %bb, %bb20
 
 bb20:                                             ; preds = %bb10
   %tmp21 = add nsw i32 %tmp16, %tmp14
-  %tmp22 = getelementptr inbounds i32 addrspace(1)* %arg, i64 %tmp12
+  %tmp22 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp12
   store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4
   %tmp23 = add nuw nsw i64 %tmp11, 1
   %tmp24 = icmp slt i64 %tmp23, %tmp8
diff --git a/test/CodeGen/R600/vector-alloca.ll b/test/CodeGen/R600/vector-alloca.ll
index 228868aa7feb..6f3b4847fbdf 100644
--- a/test/CodeGen/R600/vector-alloca.ll
+++ b/test/CodeGen/R600/vector-alloca.ll
@@ -13,16 +13,16 @@
 define void @vector_read(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %0 = alloca [4 x i32]
-  %x = getelementptr [4 x i32]* %0, i32 0, i32 0
-  %y = getelementptr [4 x i32]* %0, i32 0, i32 1
-  %z = getelementptr [4 x i32]* %0, i32 0, i32 2
-  %w = getelementptr [4 x i32]* %0, i32 0, i32 3
+  %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
+  %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
+  %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
+  %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
   store i32 0, i32* %x
   store i32 1, i32* %y
   store i32 2, i32* %z
   store i32 3, i32* %w
-  %1 = getelementptr [4 x i32]* %0, i32 0, i32 %index
-  %2 = load i32* %1
+  %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %index
+  %2 = load i32, i32* %1
   store i32 %2, i32 addrspace(1)* %out
   ret void
 }
@@ -37,18 +37,18 @@ entry:
 define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
 entry:
   %0 = alloca [4 x i32]
-  %x = getelementptr [4 x i32]* %0, i32 0, i32 0
-  %y = getelementptr [4 x i32]* %0, i32 0, i32 1
-  %z = getelementptr [4 x i32]* %0, i32 0, i32 2
-  %w = getelementptr [4 x i32]* %0, i32 0, i32 3
+  %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
+  %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
+  %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
+  %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
   store i32 0, i32* %x
   store i32 0, i32* %y
   store i32 0, i32* %z
   store i32 0, i32* %w
-  %1 = getelementptr [4 x i32]* %0, i32 0, i32 %w_index
+  %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %w_index
   store i32 1, i32* %1
-  %2 = getelementptr [4 x i32]* %0, i32 0, i32 %r_index
-  %3 = load i32* %2
+  %2 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %r_index
+  %3 = load i32, i32* %2
   store i32 %3, i32 addrspace(1)* %out
   ret void
 }
@@ -60,18 +60,18 @@ entry:
 define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
 entry:
   %0 = alloca [4 x i32]
-  %x = getelementptr [4 x i32]* %0, i32 0, i32 0
-  %y = getelementptr [4 x i32]* %0, i32 0, i32 1
-  %z = getelementptr [4 x i32]* %0, i32 0, i32 2
-  %w = getelementptr [4 x i32]* %0, i32 0, i32 3
+  %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
+  %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
+  %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
+  %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
   store i32 0, i32* %x
   store i32 0, i32* %y
   store i32 0, i32* %z
   store i32 0, i32* %w
-  %1 = getelementptr [4 x i32]* %0, i32 0, i32 1
+  %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
   %2 = bitcast i32* %1 to [4 x i32]*
-  %3 = getelementptr [4 x i32]* %2, i32 0, i32 0
-  %4 = load i32* %3
+  %3 = getelementptr [4 x i32], [4 x i32]* %2, i32 0, i32 0
+  %4 = load i32, i32* %3
   store i32 %4, i32 addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/R600/vertex-fetch-encoding.ll b/test/CodeGen/R600/vertex-fetch-encoding.ll
index e4d117f6310b..fb6a17e67146 100644
--- a/test/CodeGen/R600/vertex-fetch-encoding.ll
+++ b/test/CodeGen/R600/vertex-fetch-encoding.ll
@@ -8,7 +8,7 @@
 
 define void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
-  %0 = load i32 addrspace(1)* %in
+  %0 = load i32, i32 addrspace(1)* %in
   store i32 %0, i32 addrspace(1)* %out
   ret void
 }
@@ -19,7 +19,7 @@ entry:
 
 define void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
 entry:
-  %0 = load <4 x i32> addrspace(1)* %in
+  %0 = load <4 x i32>, <4 x i32> addrspace(1)* %in
   store <4 x i32> %0, <4 x i32> addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/R600/vop-shrink.ll b/test/CodeGen/R600/vop-shrink.ll
index d5a46e38ce26..9b2f229c05af 100644
--- a/test/CodeGen/R600/vop-shrink.ll
+++ b/test/CodeGen/R600/vop-shrink.ll
@@ -15,7 +15,7 @@ entry:
   br i1 %tmp, label %if, label %else
 
 if:                                               ; preds = %entry
-  %tmp1 = getelementptr i32 addrspace(1)* %out, i32 1
+  %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %tmp2 = extractelement <4 x i32> %sgpr, i32 1
   store i32 %tmp2, i32 addrspace(1)* %out
   br label %endif
diff --git a/test/CodeGen/R600/vselect.ll b/test/CodeGen/R600/vselect.ll
index a6152f7881ef..a3014b03d2b3 100644
--- a/test/CodeGen/R600/vselect.ll
+++ b/test/CodeGen/R600/vselect.ll
@@ -12,8 +12,8 @@
 
 define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
 entry:
-  %0 = load <2 x i32> addrspace(1)* %in0
-  %1 = load <2 x i32> addrspace(1)* %in1
+  %0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0
+  %1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1
   %cmp = icmp ne <2 x i32> %0, %1
   %result = select <2 x i1> %cmp, <2 x i32> %0, <2 x i32> %1
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
@@ -30,8 +30,8 @@ entry:
 
 define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
 entry:
-  %0 = load <2 x float> addrspace(1)* %in0
-  %1 = load <2 x float> addrspace(1)* %in1
+  %0 = load <2 x float>, <2 x float> addrspace(1)* %in0
+  %1 = load <2 x float>, <2 x float> addrspace(1)* %in1
   %cmp = fcmp une <2 x float> %0, %1
   %result = select <2 x i1> %cmp, <2 x float> %0, <2 x float> %1
   store <2 x float> %result, <2 x float> addrspace(1)* %out
@@ -52,8 +52,8 @@ entry:
 
 define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
 entry:
-  %0 = load <4 x i32> addrspace(1)* %in0
-  %1 = load <4 x i32> addrspace(1)* %in1
+  %0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0
+  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1
   %cmp = icmp ne <4 x i32> %0, %1
   %result = select <4 x i1> %cmp, <4 x i32> %0, <4 x i32> %1
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
@@ -68,8 +68,8 @@ entry:
 
 define void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
 entry:
-  %0 = load <4 x float> addrspace(1)* %in0
-  %1 = load <4 x float> addrspace(1)* %in1
+  %0 = load <4 x float>, <4 x float> addrspace(1)* %in0
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %in1
   %cmp = fcmp une <4 x float> %0, %1
   %result = select <4 x i1> %cmp, <4 x float> %0, <4 x float> %1
   store <4 x float> %result, <4 x float> addrspace(1)* %out
diff --git a/test/CodeGen/R600/vtx-fetch-branch.ll b/test/CodeGen/R600/vtx-fetch-branch.ll
index bcbe34ea543b..4584d6e25254 100644
--- a/test/CodeGen/R600/vtx-fetch-branch.ll
+++ b/test/CodeGen/R600/vtx-fetch-branch.ll
@@ -16,7 +16,7 @@ entry:
   br i1 %0, label %endif, label %if
 
 if:
-  %1 = load i32 addrspace(1)* %in
+  %1 = load i32, i32 addrspace(1)* %in
   br label %endif
 
 endif:
diff --git a/test/CodeGen/R600/vtx-schedule.ll b/test/CodeGen/R600/vtx-schedule.ll
index 8254c9923477..912e258ebb83 100644
--- a/test/CodeGen/R600/vtx-schedule.ll
+++ b/test/CodeGen/R600/vtx-schedule.ll
@@ -11,8 +11,8 @@
 ; CHECK: VTX_READ_32 [[IN1:T[0-9]+\.X]], [[IN1]], 0
 define void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) {
 entry:
-  %0 = load i32 addrspace(1)* addrspace(1)* %in0
-  %1 = load i32 addrspace(1)* %0
+  %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in0
+  %1 = load i32, i32 addrspace(1)* %0
   store i32 %1, i32 addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/R600/wait.ll b/test/CodeGen/R600/wait.ll
index 93cfdd46093e..5cc7577cad33 100644
--- a/test/CodeGen/R600/wait.ll
+++ b/test/CodeGen/R600/wait.ll
@@ -4,22 +4,21 @@
 ; CHECK-LABEL: {{^}}main:
 ; CHECK: s_load_dwordx4
 ; CHECK: s_load_dwordx4
-; CHECK: s_waitcnt lgkmcnt(0){{$}}
-; CHECK: s_waitcnt vmcnt(0){{$}}
-; CHECK: s_waitcnt expcnt(0) lgkmcnt(0){{$}}
+; CHECK: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; CHECK: s_endpgm
 define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 {
 main_body:
-  %tmp = getelementptr <16 x i8> addrspace(2)* %arg3, i32 0
-  %tmp10 = load <16 x i8> addrspace(2)* %tmp, !tbaa !0
+  %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0
+  %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
   %tmp11 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp10, i32 0, i32 %arg6)
   %tmp12 = extractelement <4 x float> %tmp11, i32 0
   %tmp13 = extractelement <4 x float> %tmp11, i32 1
   call void @llvm.AMDGPU.barrier.global() #1
   %tmp14 = extractelement <4 x float> %tmp11, i32 2
 ;  %tmp15 = extractelement <4 x float> %tmp11, i32 3
-  %tmp15 = load float addrspace(2)* %constptr, align 4 ; Force waiting for expcnt and lgkmcnt
-  %tmp16 = getelementptr <16 x i8> addrspace(2)* %arg3, i32 1
-  %tmp17 = load <16 x i8> addrspace(2)* %tmp16, !tbaa !0
+  %tmp15 = load float, float addrspace(2)* %constptr, align 4 ; Force waiting for expcnt and lgkmcnt
+  %tmp16 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 1
+  %tmp17 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp16, !tbaa !0
   %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp17, i32 0, i32 %arg6)
   %tmp19 = extractelement <4 x float> %tmp18, i32 0
   %tmp20 = extractelement <4 x float> %tmp18, i32 1
diff --git a/test/CodeGen/R600/work-item-intrinsics.ll b/test/CodeGen/R600/work-item-intrinsics.ll
index 37c0e0f304ce..4328e964c1bf 100644
--- a/test/CodeGen/R600/work-item-intrinsics.ll
+++ b/test/CodeGen/R600/work-item-intrinsics.ll
@@ -1,14 +1,15 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 
 ; FUNC-LABEL: {{^}}ngroups_x:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV [[VAL]], KC0[0].X
 
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[VVAL]]
+; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
 define void @ngroups_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.x() #0
@@ -21,8 +22,9 @@ entry:
 ; EG: MOV [[VAL]], KC0[0].Y
 
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[VVAL]]
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
 define void @ngroups_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.y() #0
@@ -35,8 +37,9 @@ entry:
 ; EG: MOV [[VAL]], KC0[0].Z
 
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[VVAL]]
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
 define void @ngroups_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.z() #0
@@ -49,8 +52,9 @@ entry:
 ; EG: MOV [[VAL]], KC0[0].W
 
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[VVAL]]
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
 define void @global_size_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.x() #0
@@ -63,8 +67,9 @@ entry:
 ; EG: MOV [[VAL]], KC0[1].X
 
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[VVAL]]
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
 define void @global_size_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.y() #0
@@ -77,8 +82,9 @@ entry:
 ; EG: MOV [[VAL]], KC0[1].Y
 
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[VVAL]]
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
 define void @global_size_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.z() #0
@@ -91,8 +97,9 @@ entry:
 ; EG: MOV [[VAL]], KC0[1].Z
 
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[VVAL]]
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
 define void @local_size_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.x() #0
@@ -105,8 +112,9 @@ entry:
 ; EG: MOV [[VAL]], KC0[1].W
 
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[VVAL]]
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
 define void @local_size_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.y() #0
@@ -119,8 +127,9 @@ entry:
 ; EG: MOV [[VAL]], KC0[2].X
 
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[VVAL]]
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
 define void @local_size_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.z() #0
@@ -133,8 +142,9 @@ entry:
 ; EG: MOV [[VAL]], KC0[2].Z
 
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[VVAL]]
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
 define void @get_work_dim (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.AMDGPU.read.workdim() #0
@@ -147,8 +157,8 @@ entry:
 ; kernel arguments, but this may change in the future.
 
 ; FUNC-LABEL: {{^}}tgid_x:
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4
-; SI: buffer_store_dword [[VVAL]]
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4
+; GCN: buffer_store_dword [[VVAL]]
 define void @tgid_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.x() #0
@@ -157,8 +167,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}tgid_y:
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5
-; SI: buffer_store_dword [[VVAL]]
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5
+; GCN: buffer_store_dword [[VVAL]]
 define void @tgid_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.y() #0
@@ -167,8 +177,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}tgid_z:
-; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6
-; SI: buffer_store_dword [[VVAL]]
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6
+; GCN: buffer_store_dword [[VVAL]]
 define void @tgid_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.z() #0
@@ -177,7 +187,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}tidig_x:
-; SI: buffer_store_dword v0
+; GCN: buffer_store_dword v0
 define void @tidig_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x() #0
@@ -186,7 +196,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}tidig_y:
-; SI: buffer_store_dword v1
+; GCN: buffer_store_dword v1
 define void @tidig_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.y() #0
@@ -195,7 +205,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}tidig_z:
-; SI: buffer_store_dword v2
+; GCN: buffer_store_dword v2
 define void @tidig_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.z() #0
diff --git a/test/CodeGen/R600/wrong-transalu-pos-fix.ll b/test/CodeGen/R600/wrong-transalu-pos-fix.ll
index 4e77c07c0ea1..5ab465338e15 100644
--- a/test/CodeGen/R600/wrong-transalu-pos-fix.ll
+++ b/test/CodeGen/R600/wrong-transalu-pos-fix.ll
@@ -35,7 +35,7 @@ entry:
   %z.i8.i = tail call i32 @llvm.r600.read.tidig.z() #1
   %add.i = add i32 %z.i8.i, %mul33.i
   %add13 = add i32 %add.i, %add
-  %arrayidx = getelementptr inbounds i32 addrspace(1)* %out, i32 %add13
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add13
   store i32 %mul3, i32 addrspace(1)* %arrayidx, align 4
   ret void
 }
diff --git a/test/CodeGen/R600/xor.ll b/test/CodeGen/R600/xor.ll
index b43ff4006473..089db59eabc7 100644
--- a/test/CodeGen/R600/xor.ll
+++ b/test/CodeGen/R600/xor.ll
@@ -11,8 +11,8 @@
 ; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
-  %a = load <2 x i32> addrspace(1) * %in0
-  %b = load <2 x i32> addrspace(1) * %in1
+  %a = load <2 x i32>, <2 x i32> addrspace(1) * %in0
+  %b = load <2 x i32>, <2 x i32> addrspace(1) * %in1
   %result = xor <2 x i32> %a, %b
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
@@ -30,8 +30,8 @@ define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in
 ; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
-  %a = load <4 x i32> addrspace(1) * %in0
-  %b = load <4 x i32> addrspace(1) * %in1
+  %a = load <4 x i32>, <4 x i32> addrspace(1) * %in0
+  %b = load <4 x i32>, <4 x i32> addrspace(1) * %in1
   %result = xor <4 x i32> %a, %b
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
@@ -40,15 +40,15 @@ define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
 ; FUNC-LABEL: {{^}}xor_i1:
 ; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
 
-; SI-DAG: v_cmp_ge_f32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, 0
-; SI-DAG: v_cmp_ge_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, 1.0
+; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}}
+; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}}
 ; SI: s_xor_b64 [[XOR:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[XOR]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
 define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
-  %a = load float addrspace(1) * %in0
-  %b = load float addrspace(1) * %in1
+  %a = load float, float addrspace(1) * %in0
+  %b = load float, float addrspace(1) * %in1
   %acmp = fcmp oge float %a, 0.000000e+00
   %bcmp = fcmp oge float %b, 1.000000e+00
   %xor = xor i1 %acmp, %bcmp
@@ -58,14 +58,14 @@ define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float ad
 }
 
 ; FUNC-LABEL: {{^}}v_xor_i1:
-; SI: buffer_load_ubyte [[A:v[0-9]+]]
 ; SI: buffer_load_ubyte [[B:v[0-9]+]]
+; SI: buffer_load_ubyte [[A:v[0-9]+]]
 ; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[A]], [[B]]
 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]]
 ; SI: buffer_store_byte [[RESULT]]
 define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
-  %a = load i1 addrspace(1)* %in0
-  %b = load i1 addrspace(1)* %in1
+  %a = load i1, i1 addrspace(1)* %in0
+  %b = load i1, i1 addrspace(1)* %in1
   %xor = xor i1 %a, %b
   store i1 %xor, i1 addrspace(1)* %out
   ret void
@@ -74,8 +74,8 @@ define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace
 ; FUNC-LABEL: {{^}}vector_xor_i32:
 ; SI: v_xor_b32_e32
 define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
-  %a = load i32 addrspace(1)* %in0
-  %b = load i32 addrspace(1)* %in1
+  %a = load i32, i32 addrspace(1)* %in0
+  %b = load i32, i32 addrspace(1)* %in1
   %result = xor i32 %a, %b
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -100,8 +100,8 @@ define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) {
 ; FUNC-LABEL: {{^}}vector_not_i32:
 ; SI: v_not_b32
 define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
-  %a = load i32 addrspace(1)* %in0
-  %b = load i32 addrspace(1)* %in1
+  %a = load i32, i32 addrspace(1)* %in0
+  %b = load i32, i32 addrspace(1)* %in1
   %result = xor i32 %a, -1
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -112,8 +112,8 @@ define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32
 ; SI: v_xor_b32_e32
 ; SI: s_endpgm
 define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
-  %a = load i64 addrspace(1)* %in0
-  %b = load i64 addrspace(1)* %in1
+  %a = load i64, i64 addrspace(1)* %in0
+  %b = load i64, i64 addrspace(1)* %in1
   %result = xor i64 %a, %b
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -140,8 +140,8 @@ define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) {
 ; SI: v_not_b32
 ; SI: v_not_b32
 define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
-  %a = load i64 addrspace(1)* %in0
-  %b = load i64 addrspace(1)* %in1
+  %a = load i64, i64 addrspace(1)* %in0
+  %b = load i64, i64 addrspace(1)* %in1
   %result = xor i64 %a, -1
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -163,7 +163,7 @@ if:
   br label %endif
 
 else:
-  %2 = load i64 addrspace(1)* %in
+  %2 = load i64, i64 addrspace(1)* %in
   br label %endif
 
 endif:
diff --git a/test/CodeGen/R600/zero_extend.ll b/test/CodeGen/R600/zero_extend.ll
index d052ee64c6f1..033055db185a 100644
--- a/test/CodeGen/R600/zero_extend.ll
+++ b/test/CodeGen/R600/zero_extend.ll
@@ -30,9 +30,9 @@ entry:
 }
 
 ; SI-LABEL: {{^}}zext_i1_to_i64:
+; SI: s_mov_b32 s{{[0-9]+}}, 0
 ; SI: v_cmp_eq_i32
 ; SI: v_cndmask_b32
-; SI: s_mov_b32 s{{[0-9]+}}, 0
 define void @zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp eq i32 %a, %b
   %ext = zext i1 %cmp to i64