Diffstat (limited to 'test/CodeGen/AMDGPU')
-rw-r--r-- test/CodeGen/AMDGPU/32-bit-local-address-space.ll | 24
-rw-r--r-- test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir | 28
-rw-r--r-- test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir | 142
-rw-r--r-- test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir | 29
-rw-r--r-- test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir | 69
-rw-r--r-- test/CodeGen/AMDGPU/GlobalISel/shader-epilogs.ll | 11
-rw-r--r-- test/CodeGen/AMDGPU/GlobalISel/smrd.ll | 89
-rw-r--r-- test/CodeGen/AMDGPU/add-debug.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/add.i16.ll | 18
-rw-r--r-- test/CodeGen/AMDGPU/add.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/add.v2i16.ll | 283
-rw-r--r-- test/CodeGen/AMDGPU/add_i128.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/add_i64.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/addrspacecast-captured.ll | 47
-rw-r--r-- test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll | 28
-rw-r--r-- test/CodeGen/AMDGPU/addrspacecast.ll | 129
-rw-r--r-- test/CodeGen/AMDGPU/amdgcn.bitcast.ll | 59
-rw-r--r-- test/CodeGen/AMDGPU/amdgcn.private-memory.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll | 41
-rw-r--r-- test/CodeGen/AMDGPU/amdgcn.sendmsg.ll | 161
-rw-r--r-- test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll | 9
-rw-r--r-- test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll | 88
-rw-r--r-- test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll | 5
-rw-r--r-- test/CodeGen/AMDGPU/amdgpu.private-memory.ll | 92
-rw-r--r-- test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll | 30
-rw-r--r-- test/CodeGen/AMDGPU/and-gcn.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/and.ll | 80
-rw-r--r-- test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll | 100
-rw-r--r-- test/CodeGen/AMDGPU/annotate-kernel-features.ll | 78
-rw-r--r-- test/CodeGen/AMDGPU/anonymous-gv.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/any_extend_vector_inreg.ll | 58
-rw-r--r-- test/CodeGen/AMDGPU/anyext.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/array-ptr-calc-i32.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/array-ptr-calc-i64.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/ashr.v2i16.ll | 161
-rw-r--r-- test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/atomic_load_add.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/atomic_load_sub.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll | 34
-rw-r--r-- test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll | 26
-rw-r--r-- test/CodeGen/AMDGPU/attr-unparseable.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/barrier-elimination.ll | 30
-rw-r--r-- test/CodeGen/AMDGPU/basic-branch.ll | 9
-rw-r--r-- test/CodeGen/AMDGPU/basic-loop.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/bfe-patterns.ll | 163
-rw-r--r-- test/CodeGen/AMDGPU/bfe_uint.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/bfi_int.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/bfm.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/big_alu.ll | 110
-rw-r--r-- test/CodeGen/AMDGPU/bitcast-vector-extract.ll | 32
-rw-r--r-- test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll | 54
-rw-r--r-- test/CodeGen/AMDGPU/bitreverse.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/br_cc.f16.ll | 48
-rw-r--r-- test/CodeGen/AMDGPU/branch-condition-and.ll | 17
-rw-r--r-- test/CodeGen/AMDGPU/branch-relax-spill.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/branch-relaxation.ll | 39
-rw-r--r-- test/CodeGen/AMDGPU/bswap.ll | 14
-rw-r--r-- test/CodeGen/AMDGPU/build_vector.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/call.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/calling-conventions.ll | 43
-rw-r--r-- test/CodeGen/AMDGPU/captured-frame-index.ll | 78
-rw-r--r-- test/CodeGen/AMDGPU/cf-loop-on-constant.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/cf-stack-bug.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/cf_end.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll | 18
-rw-r--r-- test/CodeGen/AMDGPU/cgp-addressing-modes.ll | 222
-rw-r--r-- test/CodeGen/AMDGPU/cgp-bitfield-extract.ll | 19
-rw-r--r-- test/CodeGen/AMDGPU/clamp-modifier.ll | 222
-rw-r--r-- test/CodeGen/AMDGPU/clamp-omod-special-case.mir | 424
-rw-r--r-- test/CodeGen/AMDGPU/clamp.ll | 529
-rw-r--r-- test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/coalescer-subrange-crash.ll | 32
-rw-r--r-- test/CodeGen/AMDGPU/coalescer_remat.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/code-object-metadata-deduce-ro-arg.ll | 33
-rw-r--r-- test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll | 1260
-rw-r--r-- test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-1.ll | 9
-rw-r--r-- test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-2.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-3.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/code-object-metadata-kernel-code-props.ll | 32
-rw-r--r-- test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll | 67
-rw-r--r-- test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/combine_vloads.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/commute-compares.ll | 106
-rw-r--r-- test/CodeGen/AMDGPU/commute-shifts.ll | 17
-rw-r--r-- test/CodeGen/AMDGPU/commute_modifiers.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/concat_vectors.ll | 64
-rw-r--r-- test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir | 16
-rw-r--r-- test/CodeGen/AMDGPU/constant-fold-mi-operands.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/control-flow-fastregalloc.ll | 37
-rw-r--r-- test/CodeGen/AMDGPU/convergent-inlineasm.ll | 5
-rw-r--r-- test/CodeGen/AMDGPU/copy-illegal-type.ll | 22
-rw-r--r-- test/CodeGen/AMDGPU/copy-to-reg.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/ctlz.ll | 35
-rw-r--r-- test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 34
-rw-r--r-- test/CodeGen/AMDGPU/ctpop.ll | 30
-rw-r--r-- test/CodeGen/AMDGPU/ctpop64.ll | 22
-rw-r--r-- test/CodeGen/AMDGPU/cttz_zero_undef.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/cube.ll | 18
-rw-r--r-- test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 32
-rw-r--r-- test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/debug.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/debugger-emit-prologue.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/debugger-insert-nops.ll | 26
-rw-r--r-- test/CodeGen/AMDGPU/debugger-reserve-regs.ll | 3
-rw-r--r-- test/CodeGen/AMDGPU/default-fp-mode.ll | 52
-rw-r--r-- test/CodeGen/AMDGPU/detect-dead-lanes.mir | 18
-rw-r--r-- test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/ds-combine-large-stride.ll | 412
-rw-r--r-- test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/ds-sub-offset.ll | 14
-rw-r--r-- test/CodeGen/AMDGPU/ds_read2.ll | 48
-rw-r--r-- test/CodeGen/AMDGPU/ds_read2_offset_order.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/ds_read2_superreg.ll | 24
-rw-r--r-- test/CodeGen/AMDGPU/ds_read2st64.ll | 30
-rw-r--r-- test/CodeGen/AMDGPU/ds_write2.ll | 42
-rw-r--r-- test/CodeGen/AMDGPU/ds_write2st64.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/dynamic_stackalloc.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/early-if-convert-cost.ll | 110
-rw-r--r-- test/CodeGen/AMDGPU/early-if-convert.ll | 454
-rw-r--r-- test/CodeGen/AMDGPU/early-inline-alias.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/early-inline.ll | 25
-rw-r--r-- test/CodeGen/AMDGPU/elf.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/elf.r600.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/else.ll | 18
-rw-r--r-- test/CodeGen/AMDGPU/empty-function.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll | 22
-rw-r--r-- test/CodeGen/AMDGPU/endcf-loop-header.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/env-amdgiz.ll | 11
-rw-r--r-- test/CodeGen/AMDGPU/env-amdgizcl.ll | 11
-rw-r--r-- test/CodeGen/AMDGPU/exceed-max-sgprs.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/extend-bit-ops-i16.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/extload-align.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/extload-private.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/extload.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/extract_vector_elt-f16.ll | 128
-rw-r--r-- test/CodeGen/AMDGPU/extract_vector_elt-f64.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/extract_vector_elt-i16.ll | 125
-rw-r--r-- test/CodeGen/AMDGPU/extract_vector_elt-i64.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/extract_vector_elt-i8.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/extractelt-to-trunc.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/fabs.f16.ll | 123
-rw-r--r-- test/CodeGen/AMDGPU/fabs.f64.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/fabs.ll | 14
-rw-r--r-- test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/fadd.f16.ll | 88
-rw-r--r-- test/CodeGen/AMDGPU/fadd.ll | 19
-rw-r--r-- test/CodeGen/AMDGPU/fadd64.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 321
-rw-r--r-- test/CodeGen/AMDGPU/fcanonicalize.ll | 146
-rw-r--r-- test/CodeGen/AMDGPU/fceil.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/fceil64.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/fcmp-cnd.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/fcmp.f16.ll | 64
-rw-r--r-- test/CodeGen/AMDGPU/fcmp.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/fcmp64.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/fconst64.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/fcopysign.f16.ll | 264
-rw-r--r-- test/CodeGen/AMDGPU/fcopysign.f32.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/fcopysign.f64.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/fdiv.f16.ll | 54
-rw-r--r-- test/CodeGen/AMDGPU/fdiv.f64.ll | 101
-rw-r--r-- test/CodeGen/AMDGPU/fdiv.ll | 28
-rw-r--r-- test/CodeGen/AMDGPU/ffloor.f64.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/ffloor.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/fix-vgpr-copies.mir | 44
-rw-r--r-- test/CodeGen/AMDGPU/flat-address-space.ll | 56
-rw-r--r-- test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/flat-scratch-reg.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/flat_atomics.ll | 194
-rw-r--r-- test/CodeGen/AMDGPU/flat_atomics_i64.ll | 194
-rw-r--r-- test/CodeGen/AMDGPU/fma-combine.ll | 52
-rw-r--r-- test/CodeGen/AMDGPU/fma.f64.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/fma.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/fmax3.f64.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/fmax3.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/fmax_legacy.f64.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/fmax_legacy.ll | 14
-rw-r--r-- test/CodeGen/AMDGPU/fmaxnum.f64.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/fmaxnum.ll | 34
-rw-r--r-- test/CodeGen/AMDGPU/fmed3.ll | 851
-rw-r--r-- test/CodeGen/AMDGPU/fmin3.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll | 47
-rw-r--r-- test/CodeGen/AMDGPU/fmin_legacy.f64.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/fmin_legacy.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/fminnum.f64.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/fminnum.ll | 34
-rw-r--r-- test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll | 55
-rw-r--r-- test/CodeGen/AMDGPU/fmul.f16.ll | 76
-rw-r--r-- test/CodeGen/AMDGPU/fmul.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/fmul64.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/fmuladd.f16.ll | 50
-rw-r--r-- test/CodeGen/AMDGPU/fmuladd.f32.ll | 36
-rw-r--r-- test/CodeGen/AMDGPU/fmuladd.f64.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/fmuladd.v2f16.ll | 107
-rw-r--r-- test/CodeGen/AMDGPU/fnearbyint.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/fneg-combines.ll | 951
-rw-r--r-- test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 156
-rw-r--r-- test/CodeGen/AMDGPU/fneg-fabs.f64.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/fneg-fabs.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/fneg.f16.ll | 120
-rw-r--r-- test/CodeGen/AMDGPU/fneg.f64.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/fneg.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/fold-cndmask.mir | 34
-rw-r--r-- test/CodeGen/AMDGPU/fold-immediate-output-mods.mir | 306
-rw-r--r-- test/CodeGen/AMDGPU/fp-classify.ll | 18
-rw-r--r-- test/CodeGen/AMDGPU/fp16_to_fp32.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/fp16_to_fp64.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/fp32_to_fp16.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/fp_to_sint.f64.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/fp_to_sint.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/fp_to_uint.f64.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/fp_to_uint.ll | 18
-rw-r--r-- test/CodeGen/AMDGPU/fpext.f16.ll | 243
-rw-r--r-- test/CodeGen/AMDGPU/fpext.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/fptosi.f16.ll | 43
-rw-r--r-- test/CodeGen/AMDGPU/fptoui.f16.ll | 37
-rw-r--r-- test/CodeGen/AMDGPU/fptrunc.f16.ll | 163
-rw-r--r-- test/CodeGen/AMDGPU/fptrunc.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/fract.f64.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/fract.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/frem.ll | 25
-rw-r--r-- test/CodeGen/AMDGPU/fsqrt.f64.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/fsqrt.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/fsub.f16.ll | 141
-rw-r--r-- test/CodeGen/AMDGPU/fsub.ll | 81
-rw-r--r-- test/CodeGen/AMDGPU/fsub64.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/ftrunc.f64.ll | 14
-rw-r--r-- test/CodeGen/AMDGPU/ftrunc.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/gep-address-space.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/global-constant.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/global-directive.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/global-extload-i16.ll | 64
-rw-r--r-- test/CodeGen/AMDGPU/global-variable-relocs.ll | 22
-rw-r--r-- test/CodeGen/AMDGPU/global_atomics.ll | 196
-rw-r--r-- test/CodeGen/AMDGPU/global_atomics_i64.ll | 194
-rw-r--r-- test/CodeGen/AMDGPU/gv-const-addrspace.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/gv-offset-folding.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/half.ll | 221
-rw-r--r-- test/CodeGen/AMDGPU/hsa-default-device.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/hsa-fp-mode.ll | 31
-rw-r--r-- test/CodeGen/AMDGPU/hsa-func.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/hsa-globals.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/hsa-group-segment.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/hsa-note-no-func.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/hsa.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/i1-copy-implicit-def.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/i1-copy-phi.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/i8-to-double-to-float.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/icmp.i16.ll | 40
-rw-r--r-- test/CodeGen/AMDGPU/icmp64.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll | 45
-rw-r--r-- test/CodeGen/AMDGPU/image-attributes.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/image-resource-id.ll | 40
-rw-r--r-- test/CodeGen/AMDGPU/imm.ll | 151
-rw-r--r-- test/CodeGen/AMDGPU/imm16.ll | 66
-rw-r--r-- test/CodeGen/AMDGPU/immv216.ll | 446
-rw-r--r-- test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/indirect-addressing-si.ll | 49
-rw-r--r-- test/CodeGen/AMDGPU/indirect-private-64.ll | 22
-rw-r--r-- test/CodeGen/AMDGPU/infinite-loop-evergreen.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/infinite-loop.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/inline-asm.ll | 81
-rw-r--r-- test/CodeGen/AMDGPU/inline-calls.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/inline-constraints.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/inlineasm-16.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/inlineasm-illegal-type.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/inlineasm-packed.ll | 57
-rw-r--r-- test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir | 40
-rw-r--r-- test/CodeGen/AMDGPU/insert-waits-callee.mir | 25
-rw-r--r-- test/CodeGen/AMDGPU/insert-waits-exp.mir | 12
-rw-r--r-- test/CodeGen/AMDGPU/insert_subreg.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/insert_vector_elt.ll | 156
-rw-r--r-- test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 470
-rw-r--r-- test/CodeGen/AMDGPU/inserted-wait-states.mir | 226
-rw-r--r-- test/CodeGen/AMDGPU/internalize.ll | 35
-rw-r--r-- test/CodeGen/AMDGPU/invalid-addrspacecast.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll | 7
-rw-r--r-- test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll | 7
-rw-r--r-- test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/invert-br-undef-vcc.mir | 2
-rw-r--r-- test/CodeGen/AMDGPU/kcache-fold.ll | 186
-rw-r--r-- test/CodeGen/AMDGPU/kernarg-stack-alignment.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/kernel-args.ll | 72
-rw-r--r-- test/CodeGen/AMDGPU/large-alloca-compute.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/large-alloca-graphics.ll | 3
-rw-r--r-- test/CodeGen/AMDGPU/large-constant-initializer.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll | 72
-rw-r--r-- test/CodeGen/AMDGPU/lds-alignment.ll | 28
-rw-r--r-- test/CodeGen/AMDGPU/lds-initializer.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/lds-oqap-crash.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/lds-output-queue.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/lds-size.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/lds-zero-initializer.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/limit-coalesce.mir | 71
-rw-r--r-- test/CodeGen/AMDGPU/literals.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/liveness.mir | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll | 437
-rw-r--r-- test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll | 631
-rw-r--r-- test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll | 56
-rw-r--r-- test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll | 29
-rw-r--r-- test/CodeGen/AMDGPU/llvm.SI.export.ll | 237
-rw-r--r-- test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll | 59
-rw-r--r-- test/CodeGen/AMDGPU/llvm.SI.gather4.ll | 525
-rw-r--r-- test/CodeGen/AMDGPU/llvm.SI.getlod.ll | 44
-rw-r--r-- test/CodeGen/AMDGPU/llvm.SI.image.ll | 49
-rw-r--r-- test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll | 94
-rw-r--r-- test/CodeGen/AMDGPU/llvm.SI.image.sample.ll | 309
-rw-r--r-- test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll | 309
-rw-r--r-- test/CodeGen/AMDGPU/llvm.SI.load.dword.ll | 7
-rw-r--r-- test/CodeGen/AMDGPU/llvm.SI.packf16.ll | 28
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll | 188
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll | 177
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.class.ll | 60
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll | 166
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll | 14
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll | 40
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll | 9
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll | 162
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll | 484
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll | 62
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll | 39
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll | 28
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll | 9
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll | 51
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll | 74
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.image.ll | 172
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll | 252
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll | 259
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll | 91
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll | 24
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll | 26
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll | 42
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll | 14
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll | 11
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll | 556
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll | 127
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll | 29
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll | 623
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/llvm.ceil.f16.ll | 24
-rw-r--r-- test/CodeGen/AMDGPU/llvm.cos.f16.ll | 46
-rw-r--r-- test/CodeGen/AMDGPU/llvm.cos.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/llvm.dbg.value.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.exp2.f16.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/llvm.exp2.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/llvm.floor.f16.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/llvm.fma.f16.ll | 177
-rw-r--r-- test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll | 63
-rw-r--r-- test/CodeGen/AMDGPU/llvm.log2.f16.ll | 32
-rw-r--r-- test/CodeGen/AMDGPU/llvm.log2.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/llvm.maxnum.f16.ll | 90
-rw-r--r-- test/CodeGen/AMDGPU/llvm.memcpy.ll | 22
-rw-r--r-- test/CodeGen/AMDGPU/llvm.minnum.f16.ll | 98
-rw-r--r-- test/CodeGen/AMDGPU/llvm.r600.cube.ll (renamed from test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll) | 4
-rw-r--r-- test/CodeGen/AMDGPU/llvm.r600.dot4.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/llvm.r600.tex.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/llvm.rint.f16.ll | 33
-rw-r--r-- test/CodeGen/AMDGPU/llvm.rint.f64.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/llvm.rint.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/llvm.round.f64.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/llvm.round.ll | 81
-rw-r--r-- test/CodeGen/AMDGPU/llvm.sin.f16.ll | 46
-rw-r--r-- test/CodeGen/AMDGPU/llvm.sin.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/llvm.sqrt.f16.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/llvm.trunc.f16.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/load-constant-f64.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/load-constant-i1.ll | 88
-rw-r--r-- test/CodeGen/AMDGPU/load-constant-i16.ll | 80
-rw-r--r-- test/CodeGen/AMDGPU/load-constant-i32.ll | 40
-rw-r--r-- test/CodeGen/AMDGPU/load-constant-i64.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/load-constant-i8.ll | 112
-rw-r--r-- test/CodeGen/AMDGPU/load-global-f32.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/load-global-f64.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/load-global-i1.ll | 88
-rw-r--r-- test/CodeGen/AMDGPU/load-global-i16.ll | 80
-rw-r--r-- test/CodeGen/AMDGPU/load-global-i32.ll | 40
-rw-r--r-- test/CodeGen/AMDGPU/load-global-i64.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/load-global-i8.ll | 112
-rw-r--r-- test/CodeGen/AMDGPU/load-input-fold.ll | 9
-rw-r--r-- test/CodeGen/AMDGPU/load-local-f32.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/load-local-f64.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/load-local-i1.ll | 88
-rw-r--r-- test/CodeGen/AMDGPU/load-local-i16.ll | 80
-rw-r--r-- test/CodeGen/AMDGPU/load-local-i32.ll | 40
-rw-r--r-- test/CodeGen/AMDGPU/load-local-i64.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/load-local-i8.ll | 112
-rw-r--r-- test/CodeGen/AMDGPU/load-weird-sizes.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/local-64.ll | 32
-rw-r--r-- test/CodeGen/AMDGPU/local-atomics.ll | 108
-rw-r--r-- test/CodeGen/AMDGPU/local-atomics64.ll | 100
-rw-r--r-- test/CodeGen/AMDGPU/local-memory.amdgcn.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/local-memory.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/local-memory.r600.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/local-stack-slot-bug.ll | 7
-rw-r--r-- test/CodeGen/AMDGPU/loop-address.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/loop-idiom.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/loop_break.ll | 265
-rw-r--r-- test/CodeGen/AMDGPU/lower-mem-intrinsics.ll | 117
-rw-r--r-- test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll | 14
-rw-r--r-- test/CodeGen/AMDGPU/lshl.ll | 15
-rw-r--r-- test/CodeGen/AMDGPU/lshr.ll | 15
-rw-r--r-- test/CodeGen/AMDGPU/lshr.v2i16.ll | 149
-rw-r--r-- test/CodeGen/AMDGPU/mad-combine.ll | 28
-rw-r--r-- test/CodeGen/AMDGPU/mad24-get-global-id.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/mad_int24.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/mad_uint24.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/madak.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/madmk.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/max.i16.ll | 90
-rw-r--r-- test/CodeGen/AMDGPU/max.ll | 46
-rw-r--r-- test/CodeGen/AMDGPU/max3.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/mem-builtins.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/merge-stores.ll | 100
-rw-r--r-- test/CodeGen/AMDGPU/min.ll | 303
-rw-r--r-- test/CodeGen/AMDGPU/min3.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/missing-store.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/mubuf.ll | 26
-rw-r--r-- test/CodeGen/AMDGPU/mul.ll | 62
-rw-r--r-- test/CodeGen/AMDGPU/mul_int24.ll | 14
-rw-r--r-- test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll | 26
-rw-r--r-- test/CodeGen/AMDGPU/mul_uint24-r600.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/mulhu.ll | 17
-rw-r--r-- test/CodeGen/AMDGPU/multi-divergent-exit-region.ll | 710
-rw-r--r-- test/CodeGen/AMDGPU/multilevel-break.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/nested-loop-conditions.ll | 269
-rw-r--r-- test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/no-shrink-extloads.ll | 36
-rw-r--r-- test/CodeGen/AMDGPU/nop-data.ll | 87
-rw-r--r-- test/CodeGen/AMDGPU/nullptr.ll | 113
-rw-r--r-- test/CodeGen/AMDGPU/omod.ll | 297
-rw-r--r-- test/CodeGen/AMDGPU/opencl-image-metadata.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/operand-folding.ll | 14
-rw-r--r-- test/CodeGen/AMDGPU/operand-spacing.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/optimize-if-exec-masking.mir | 20
-rw-r--r-- test/CodeGen/AMDGPU/or.ll | 44
-rw-r--r-- test/CodeGen/AMDGPU/over-max-lds-size.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/pack.v2f16.ll | 219
-rw-r--r-- test/CodeGen/AMDGPU/pack.v2i16.ll | 181
-rw-r--r-- test/CodeGen/AMDGPU/packetizer.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/parallelandifcollapse.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/parallelorifcollapse.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll | 638
-rw-r--r-- test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/predicates.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/private-access-no-objects.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/private-element-size.ll | 102
-rw-r--r-- test/CodeGen/AMDGPU/private-memory-atomics.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/private-memory-broken.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/private-memory-r600.ll | 37
-rw-r--r-- test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/promote-alloca-globals.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/promote-alloca-lifetime.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll | 14
-rw-r--r-- test/CodeGen/AMDGPU/promote-alloca-no-opts.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll | 14
-rw-r--r-- test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/promote-alloca-volatile.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/pv.ll | 458
-rw-r--r-- test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/r600.alu-limits.ll | 29
-rw-r--r-- test/CodeGen/AMDGPU/r600.amdgpu-alias-analysis.ll | 7
-rw-r--r-- test/CodeGen/AMDGPU/r600.bitcast.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/r600.global_atomics.ll | 542
-rw-r--r-- test/CodeGen/AMDGPU/r600.private-memory.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/rcp-pattern.ll | 47
-rw-r--r-- test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/read_register.ll | 14
-rw-r--r-- test/CodeGen/AMDGPU/readcyclecounter.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/reduce-load-width-alignment.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/reduce-store-width-alignment.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/regcoalesce-dbg.mir | 76
-rw-r--r-- test/CodeGen/AMDGPU/register-count-comments.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/rename-disconnected-bug.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/rename-independent-subregs.mir | 4
-rw-r--r-- test/CodeGen/AMDGPU/reorder-stores.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/ret.ll | 207
-rw-r--r-- test/CodeGen/AMDGPU/ret_jump.ll | 108
-rw-r--r-- test/CodeGen/AMDGPU/rotl.i64.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/rotl.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/rotr.i64.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/rotr.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/rsq.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/runtime-metadata.ll | 396
-rw-r--r-- test/CodeGen/AMDGPU/s_addk_i32.ll | 29
-rw-r--r-- test/CodeGen/AMDGPU/s_movk_i32.ll | 26
-rw-r--r-- test/CodeGen/AMDGPU/s_mulk_i32.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/sad.ll | 34
-rw-r--r-- test/CodeGen/AMDGPU/saddo.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/salu-to-valu.ll | 40
-rw-r--r-- test/CodeGen/AMDGPU/sampler-resource-id.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/scalar-store-cache-flush.mir | 14
-rw-r--r-- test/CodeGen/AMDGPU/scalar_to_vector.ll | 51
-rw-r--r-- test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll | 130
-rw-r--r-- test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll | 137
-rw-r--r-- test/CodeGen/AMDGPU/schedule-fs-loop.ll | 121
-rw-r--r-- test/CodeGen/AMDGPU/schedule-global-loads.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/schedule-if-2.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/schedule-if.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/schedule-regpressure-limit.ll | 591
-rw-r--r-- test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll | 288
-rw-r--r-- test/CodeGen/AMDGPU/schedule-regpressure.mir | 57
-rw-r--r-- test/CodeGen/AMDGPU/scratch-buffer.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/sdiv.ll | 30
-rw-r--r-- test/CodeGen/AMDGPU/sdivrem24.ll | 34
-rw-r--r-- test/CodeGen/AMDGPU/sdivrem64.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/sdwa-peephole.ll | 395
-rw-r--r-- test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll | 92
-rw-r--r-- test/CodeGen/AMDGPU/select-i1.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/select-opt.ll | 18
-rw-r--r-- test/CodeGen/AMDGPU/select-vectors.ll | 26
-rw-r--r-- test/CodeGen/AMDGPU/select.f16.ll | 118
-rw-r--r-- test/CodeGen/AMDGPU/select.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/select64.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/selectcc-cnd.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/selectcc-cnde-int.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/selectcc-opt.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/selectcc.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/selected-stack-object.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/set-dx10.ll | 24
-rw-r--r-- test/CodeGen/AMDGPU/setcc-equivalent.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/setcc-fneg-constant.ll | 258
-rw-r--r-- test/CodeGen/AMDGPU/setcc-opt.ll | 40
-rw-r--r-- test/CodeGen/AMDGPU/setcc.ll | 60
-rw-r--r-- test/CodeGen/AMDGPU/setcc64.ll | 48
-rw-r--r-- test/CodeGen/AMDGPU/seto.ll | 7
-rw-r--r-- test/CodeGen/AMDGPU/setuo.ll | 7
-rw-r--r-- test/CodeGen/AMDGPU/sext-eliminate.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/sext-in-reg.ll | 353
-rw-r--r-- test/CodeGen/AMDGPU/sgpr-control-flow.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/sgpr-copy.ll | 205
-rw-r--r-- test/CodeGen/AMDGPU/sgprcopies.ll | 58
-rw-r--r-- test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll | 60
-rw-r--r-- test/CodeGen/AMDGPU/shift-i64-opts.ll | 40
-rw-r--r-- test/CodeGen/AMDGPU/shl.ll | 82
-rw-r--r-- test/CodeGen/AMDGPU/shl.v2i16.ll | 152
-rw-r--r-- test/CodeGen/AMDGPU/shl_add_constant.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/shl_add_ptr.ll | 36
-rw-r--r-- test/CodeGen/AMDGPU/shrink-add-sub-constant.ll | 186
-rw-r--r-- test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir | 12
-rw-r--r-- test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll | 47
-rw-r--r-- test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll | 40
-rw-r--r-- test/CodeGen/AMDGPU/si-annotate-cf.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir | 2
-rw-r--r-- test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/si-literal-folding.ll | 14
-rw-r--r-- test/CodeGen/AMDGPU/si-lod-bias.ll | 49
-rw-r--r-- test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll | 58
-rw-r--r-- test/CodeGen/AMDGPU/si-scheduler.ll | 49
-rw-r--r-- test/CodeGen/AMDGPU/si-sgpr-spill.ll | 843
-rw-r--r-- test/CodeGen/AMDGPU/si-spill-cf.ll | 732
-rw-r--r-- test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll | 39
-rw-r--r-- test/CodeGen/AMDGPU/si-vector-hang.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/sign_extend.ll | 26
-rw-r--r-- test/CodeGen/AMDGPU/sint_to_fp.f64.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/sint_to_fp.i64.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/sint_to_fp.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/sitofp.f16.ll | 60
-rw-r--r-- test/CodeGen/AMDGPU/skip-if-dead.ll | 9
-rw-r--r-- test/CodeGen/AMDGPU/smed3.ll | 108
-rw-r--r-- test/CodeGen/AMDGPU/sminmax.ll | 22
-rw-r--r-- test/CodeGen/AMDGPU/sminmax.v2i16.ll | 224
-rw-r--r-- test/CodeGen/AMDGPU/smrd-vccz-bug.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/smrd.ll | 114
-rw-r--r-- test/CodeGen/AMDGPU/sopk-compares.ll | 76
-rw-r--r-- test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/spill-cfg-position.ll | 78
-rw-r--r-- test/CodeGen/AMDGPU/spill-m0.ll | 55
-rw-r--r-- test/CodeGen/AMDGPU/spill-scavenge-offset.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/spill-wide-sgpr.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/split-scalar-i64-add.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/split-smrd.ll | 29
-rw-r--r-- test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/splitkit.mir | 105
-rw-r--r-- test/CodeGen/AMDGPU/sra.ll | 38
-rw-r--r-- test/CodeGen/AMDGPU/srem.ll | 26
-rw-r--r-- test/CodeGen/AMDGPU/srl.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/ssubo.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/store-barrier.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/store-global.ll | 46
-rw-r--r-- test/CodeGen/AMDGPU/store-local.ll | 24
-rw-r--r-- test/CodeGen/AMDGPU/store-private.ll | 48
-rw-r--r-- test/CodeGen/AMDGPU/store-v3i64.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/store-vector-ptrs.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/store_typed.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/structurize.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/structurize1.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/sub.i16.ll | 22
-rw-r--r-- test/CodeGen/AMDGPU/sub.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/sub.v2i16.ll | 278
-rw-r--r-- test/CodeGen/AMDGPU/subreg-coalescer-crash.ll | 50
-rw-r--r-- test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/subreg-eliminate-dead.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/subreg-intervals.mir | 4
-rw-r--r-- test/CodeGen/AMDGPU/subreg_interference.mir | 24
-rw-r--r-- test/CodeGen/AMDGPU/target-cpu.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/trap.ll | 78
-rw-r--r-- test/CodeGen/AMDGPU/trunc-bitcast-vector.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/trunc-cmp-constant.ll | 26
-rw-r--r-- test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/trunc-store-i1.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/trunc-store.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/trunc.ll | 30
-rw-r--r-- test/CodeGen/AMDGPU/tti-unroll-prefs.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/uaddo.ll | 119
-rw-r--r-- test/CodeGen/AMDGPU/udiv.ll | 72
-rw-r--r-- test/CodeGen/AMDGPU/udivrem.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/udivrem24.ll | 34
-rw-r--r-- test/CodeGen/AMDGPU/udivrem64.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/uint_to_fp.f64.ll | 18
-rw-r--r-- test/CodeGen/AMDGPU/uint_to_fp.i64.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/uint_to_fp.ll | 18
-rw-r--r-- test/CodeGen/AMDGPU/uitofp.f16.ll | 56
-rw-r--r-- test/CodeGen/AMDGPU/umed3.ll | 113
-rw-r--r-- test/CodeGen/AMDGPU/unaligned-load-store.ll | 62
-rw-r--r-- test/CodeGen/AMDGPU/undefined-subreg-liverange.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll | 1
-rw-r--r-- test/CodeGen/AMDGPU/uniform-cfg.ll | 68
-rw-r--r-- test/CodeGen/AMDGPU/uniform-crash.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/unify-metadata.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/unigine-liveness-crash.ll | 55
-rw-r--r-- test/CodeGen/AMDGPU/unknown-processor.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/unroll.ll | 68
-rw-r--r-- test/CodeGen/AMDGPU/unsupported-cc.ll | 20
-rw-r--r-- test/CodeGen/AMDGPU/urecip.ll | 13
-rw-r--r-- test/CodeGen/AMDGPU/urem.ll | 14
-rw-r--r-- test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll | 36
-rw-r--r-- test/CodeGen/AMDGPU/usubo.ll | 114
-rw-r--r-- test/CodeGen/AMDGPU/v1i64-kernel-arg.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/v_cndmask.ll | 52
-rw-r--r-- test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll | 12
-rw-r--r-- test/CodeGen/AMDGPU/v_mac.ll | 49
-rw-r--r-- test/CodeGen/AMDGPU/v_mac_f16.ll | 317
-rw-r--r-- test/CodeGen/AMDGPU/v_madak_f16.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/valu-i1.ll | 97
-rw-r--r-- test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir | 4
-rw-r--r-- test/CodeGen/AMDGPU/vector-alloca.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/vector-extract-insert.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/vectorize-global-local.ll | 80
-rw-r--r-- test/CodeGen/AMDGPU/vertex-fetch-encoding.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll | 105
-rw-r--r-- test/CodeGen/AMDGPU/vi-removed-intrinsics.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/vop-shrink.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/vselect.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/vselect64.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/vtx-fetch-branch.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/vtx-schedule.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/wait.ll | 72
-rw-r--r-- test/CodeGen/AMDGPU/waitcnt-flat.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/waitcnt.mir | 75
-rw-r--r-- test/CodeGen/AMDGPU/wqm.ll | 120
-rw-r--r-- test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/write_register.ll | 14
-rw-r--r-- test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/xfail.r600.bitcast.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/xor.ll | 38
-rw-r--r-- test/CodeGen/AMDGPU/zero_extend.ll | 10
-rw-r--r-- test/CodeGen/AMDGPU/zext-i64-bit-operand.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/zext-lid.ll | 83
761 files changed, 29915 insertions, 12518 deletions
diff --git a/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
index edad18e244d0..ca661cf9a712 100644
--- a/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
+++ b/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
@@ -13,7 +13,7 @@
; FUNC-LABEL: {{^}}local_address_load:
; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]]
; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
-define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
entry:
%0 = load i32, i32 addrspace(3)* %in
store i32 %0, i32 addrspace(1)* %out
@@ -24,7 +24,7 @@ entry:
; SI: s_add_i32 [[SPTR:s[0-9]]]
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; SI: ds_read_b32 [[VPTR]]
-define void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) {
+define amdgpu_kernel void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) {
entry:
%0 = getelementptr i32, i32 addrspace(3)* %in, i32 %offset
%1 = load i32, i32 addrspace(3)* %0
@@ -35,7 +35,7 @@ entry:
; FUNC-LABEL: {{^}}local_address_gep_const_offset:
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4
-define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
entry:
%0 = getelementptr i32, i32 addrspace(3)* %in, i32 1
%1 = load i32, i32 addrspace(3)* %0
@@ -48,7 +48,7 @@ entry:
; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; SI: ds_read_b32 [[VPTR]]
-define void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
entry:
%0 = getelementptr i32, i32 addrspace(3)* %in, i32 16385
%1 = load i32, i32 addrspace(3)* %0
@@ -60,7 +60,7 @@ entry:
; SI: v_cmp_ne_u32
; SI-NOT: v_cmp_ne_u32
; SI: v_cndmask_b32
-define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
+define amdgpu_kernel void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
%cmp = icmp ne i32 addrspace(3)* %lds, null
%x = select i1 %cmp, i32 123, i32 456
store i32 %x, i32 addrspace(1)* %out
@@ -71,7 +71,7 @@ define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds)
; SI: s_mul_i32
; SI-NEXT: s_add_i32
; SI: ds_read_b32
-define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
+define amdgpu_kernel void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
%ptr = getelementptr [3 x float], [3 x float] addrspace(3)* %lds, i32 %tid, i32 0
%val = load float, float addrspace(3)* %ptr
store float %val, float addrspace(1)* %out
@@ -83,7 +83,7 @@ define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %
; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
; SI: ds_read_b32 v{{[0-9]+}}, [[REG]]
-define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
+define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
%val = load float, float addrspace(3)* @g_lds
store float %val, float addrspace(1)* %out
ret void
@@ -95,14 +95,14 @@ define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %ti
; FUNC-LABEL: {{^}}global_ptr:
; SI: ds_write_b32
-define void @global_ptr() nounwind {
+define amdgpu_kernel void @global_ptr() nounwind {
store i32 addrspace(3)* getelementptr ([16383 x i32], [16383 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
ret void
}
; FUNC-LABEL: {{^}}local_address_store:
; SI: ds_write_b32
-define void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
+define amdgpu_kernel void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
store i32 %val, i32 addrspace(3)* %out
ret void
}
@@ -111,7 +111,7 @@ define void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
; SI: s_add_i32 [[SADDR:s[0-9]+]],
; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]]
; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}}
-define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
+define amdgpu_kernel void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 %offset
store i32 %val, i32 addrspace(3)* %gep, align 4
ret void
@@ -121,7 +121,7 @@ define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4
-define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
+define amdgpu_kernel void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 1
store i32 %val, i32 addrspace(3)* %gep, align 4
ret void
@@ -132,7 +132,7 @@ define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %v
; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}}
-define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
+define amdgpu_kernel void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 16385
store i32 %val, i32 addrspace(3)* %gep, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
new file mode 100644
index 000000000000..56a9e7022db9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
@@ -0,0 +1,28 @@
+# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN
+
+# REQUIRES: global-isel
+
+--- |
+ define amdgpu_kernel void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
+...
+---
+
+name: global_addrspace
+legalized: true
+regBankSelected: true
+
+# GCN: global_addrspace
+# GCN: [[PTR:%[0-9]+]] = COPY %vgpr0_vgpr1
+# GCN: FLAT_LOAD_DWORD [[PTR]], 0, 0, 0
+
+body: |
+ bb.0:
+ liveins: %vgpr0_vgpr1
+
+ %0:vgpr(p1) = COPY %vgpr0_vgpr1
+ %1:vgpr(s32) = G_LOAD %0 :: (load 4 from %ir.global0)
+ %vgpr0 = COPY %1
+
+...
+---
diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
new file mode 100644
index 000000000000..ea2ad2ba83a5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
@@ -0,0 +1,142 @@
+# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN,SI,SICI,SIVI
+# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN,CI,SICI
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN,VI,SIVI
+
+# REQUIRES: global-isel
+
+--- |
+ define amdgpu_kernel void @smrd_imm(i32 addrspace(2)* %const0) { ret void }
+...
+---
+
+name: smrd_imm
+legalized: true
+regBankSelected: true
+
+# GCN: body:
+# GCN: [[PTR:%[0-9]+]] = COPY %sgpr0_sgpr1
+
+# Immediate offset:
+# SICI: S_LOAD_DWORD_IMM [[PTR]], 1, 0
+# VI: S_LOAD_DWORD_IMM [[PTR]], 4, 0
+
+# Max immediate offset for SI
+# SICI: S_LOAD_DWORD_IMM [[PTR]], 255, 0
+# VI: S_LOAD_DWORD_IMM [[PTR]], 1020, 0
+
+# Immediate overflow for SI
+# SI: [[K1024:%[0-9]+]] = S_MOV_B32 1024
+# SI: S_LOAD_DWORD_SGPR [[PTR]], [[K1024]], 0
+# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 256, 0
+# VI: S_LOAD_DWORD_IMM [[PTR]], 1024, 0
+
+# Max immediate offset for VI
+# SI: [[K1048572:%[0-9]+]] = S_MOV_B32 1048572
+# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 262143
+# VI: S_LOAD_DWORD_IMM [[PTR]], 1048572
+
+#
+# Immediate overflow for VI
+# SIVI: [[K1048576:%[0-9]+]] = S_MOV_B32 1048576
+# SIVI: S_LOAD_DWORD_SGPR [[PTR]], [[K1048576]], 0
+# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 262144, 0
+
+# Max immediate for CI
+# SIVI: [[K_LO:%[0-9]+]] = S_MOV_B32 4294967292
+# SIVI: [[K_HI:%[0-9]+]] = S_MOV_B32 3
+# SIVI: [[K:%[0-9]+]] = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2
+# SIVI-DAG: [[K_SUB0:%[0-9]+]] = COPY [[K]].sub0
+# SIVI-DAG: [[PTR_LO:%[0-9]+]] = COPY [[PTR]].sub0
+# SIVI: [[ADD_PTR_LO:%[0-9]+]] = S_ADD_U32 [[PTR_LO]], [[K_SUB0]]
+# SIVI-DAG: [[K_SUB1:%[0-9]+]] = COPY [[K]].sub1
+# SIVI-DAG: [[PTR_HI:%[0-9]+]] = COPY [[PTR]].sub1
+# SIVI: [[ADD_PTR_HI:%[0-9]+]] = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]]
+# SIVI: [[ADD_PTR:%[0-9]+]] = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2
+# SIVI: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0
+# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 4294967295, 0
+
+# Immediate overflow for CI
+# GCN: [[K_LO:%[0-9]+]] = S_MOV_B32 0
+# GCN: [[K_HI:%[0-9]+]] = S_MOV_B32 4
+# GCN: [[K:%[0-9]+]] = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2
+# GCN-DAG: [[K_SUB0:%[0-9]+]] = COPY [[K]].sub0
+# GCN-DAG: [[PTR_LO:%[0-9]+]] = COPY [[PTR]].sub0
+# GCN: [[ADD_PTR_LO:%[0-9]+]] = S_ADD_U32 [[PTR_LO]], [[K_SUB0]]
+# GCN-DAG: [[K_SUB1:%[0-9]+]] = COPY [[K]].sub1
+# GCN-DAG: [[PTR_HI:%[0-9]+]] = COPY [[PTR]].sub1
+# GCN: [[ADD_PTR_HI:%[0-9]+]] = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]]
+# GCN: [[ADD_PTR:%[0-9]+]] = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2
+# GCN: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0
+
+# Max 32-bit byte offset
+# SIVI: [[K4294967292:%[0-9]+]] = S_MOV_B32 4294967292
+# SIVI: S_LOAD_DWORD_SGPR [[PTR]], [[K4294967292]], 0
+# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 1073741823, 0
+
+# Overflow 32-bit byte offset
+# SIVI: [[K_LO:%[0-9]+]] = S_MOV_B32 0
+# SIVI: [[K_HI:%[0-9]+]] = S_MOV_B32 1
+# SIVI: [[K:%[0-9]+]] = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2
+# SIVI-DAG: [[K_SUB0:%[0-9]+]] = COPY [[K]].sub0
+# SIVI-DAG: [[PTR_LO:%[0-9]+]] = COPY [[PTR]].sub0
+# SIVI: [[ADD_PTR_LO:%[0-9]+]] = S_ADD_U32 [[PTR_LO]], [[K_SUB0]]
+# SIVI-DAG: [[K_SUB1:%[0-9]+]] = COPY [[K]].sub1
+# SIVI-DAG: [[PTR_HI:%[0-9]+]] = COPY [[PTR]].sub1
+# SIVI: [[ADD_PTR_HI:%[0-9]+]] = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]]
+# SIVI: [[ADD_PTR:%[0-9]+]] = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2
+# SIVI: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0
+# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 1073741824, 0
+
+body: |
+ bb.0:
+ liveins: %sgpr0_sgpr1
+
+ %0:sgpr(p2) = COPY %sgpr0_sgpr1
+
+ %1:sgpr(s64) = G_CONSTANT i64 4
+ %2:sgpr(p2) = G_GEP %0, %1
+ %3:sgpr(s32) = G_LOAD %2 :: (load 4 from %ir.const0)
+ %sgpr0 = COPY %3
+
+ %4:sgpr(s64) = G_CONSTANT i64 1020
+ %5:sgpr(p2) = G_GEP %0, %4
+ %6:sgpr(s32) = G_LOAD %5 :: (load 4 from %ir.const0)
+ %sgpr0 = COPY %6
+
+ %7:sgpr(s64) = G_CONSTANT i64 1024
+ %8:sgpr(p2) = G_GEP %0, %7
+ %9:sgpr(s32) = G_LOAD %8 :: (load 4 from %ir.const0)
+ %sgpr0 = COPY %9
+
+ %10:sgpr(s64) = G_CONSTANT i64 1048572
+ %11:sgpr(p2) = G_GEP %0, %10
+ %12:sgpr(s32) = G_LOAD %11 :: (load 4 from %ir.const0)
+ %sgpr0 = COPY %12
+
+ %13:sgpr(s64) = G_CONSTANT i64 1048576
+ %14:sgpr(p2) = G_GEP %0, %13
+ %15:sgpr(s32) = G_LOAD %14 :: (load 4 from %ir.const0)
+ %sgpr0 = COPY %15
+
+ %16:sgpr(s64) = G_CONSTANT i64 17179869180
+ %17:sgpr(p2) = G_GEP %0, %16
+ %18:sgpr(s32) = G_LOAD %17 :: (load 4 from %ir.const0)
+ %sgpr0 = COPY %18
+
+ %19:sgpr(s64) = G_CONSTANT i64 17179869184
+ %20:sgpr(p2) = G_GEP %0, %19
+ %21:sgpr(s32) = G_LOAD %20 :: (load 4 from %ir.const0)
+ %sgpr0 = COPY %21
+
+ %22:sgpr(s64) = G_CONSTANT i64 4294967292
+ %23:sgpr(p2) = G_GEP %0, %22
+ %24:sgpr(s32) = G_LOAD %23 :: (load 4 from %ir.const0)
+ %sgpr0 = COPY %24
+
+ %25:sgpr(s64) = G_CONSTANT i64 4294967296
+ %26:sgpr(p2) = G_GEP %0, %25
+ %27:sgpr(s32) = G_LOAD %26 :: (load 4 from %ir.const0)
+ %sgpr0 = COPY %27
+
+...
+---
diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
new file mode 100644
index 000000000000..ea435725bf25
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
@@ -0,0 +1,29 @@
+# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN
+
+# REQUIRES: global-isel
+
+--- |
+ define amdgpu_kernel void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
+...
+---
+
+name: global_addrspace
+legalized: true
+regBankSelected: true
+
+# GCN: global_addrspace
+# GCN: [[PTR:%[0-9]+]] = COPY %vgpr0_vgpr1
+# GCN: [[VAL:%[0-9]+]] = COPY %vgpr2
+# GCN: FLAT_STORE_DWORD [[PTR]], [[VAL]], 0, 0, 0
+
+body: |
+ bb.0:
+ liveins: %vgpr0_vgpr1, %vgpr2
+
+ %0:vgpr(p1) = COPY %vgpr0_vgpr1
+ %1:vgpr(s32) = COPY %vgpr2
+ G_STORE %1, %0 :: (store 4 into %ir.global0)
+
+...
+---
diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
new file mode 100644
index 000000000000..3496b1ab71fe
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
@@ -0,0 +1,69 @@
+# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=regbankselect -global-isel %s -verify-machineinstrs -o - | FileCheck %s
+
+# REQUIRES: global-isel
+
+--- |
+ define amdgpu_kernel void @load_constant(i32 addrspace(2)* %ptr0) { ret void }
+ define amdgpu_kernel void @load_global_uniform(i32 addrspace(1)* %ptr1) {
+ %tmp0 = load i32, i32 addrspace(1)* %ptr1
+ ret void
+ }
+ define amdgpu_kernel void @load_global_non_uniform(i32 addrspace(1)* %ptr2) {
+ %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %tmp1 = getelementptr i32, i32 addrspace(1)* %ptr2, i32 %tmp0
+ %tmp2 = load i32, i32 addrspace(1)* %tmp1
+ ret void
+ }
+ declare i32 @llvm.amdgcn.workitem.id.x() #0
+ attributes #0 = { nounwind readnone }
+...
+
+---
+name: load_constant
+legalized: true
+
+# CHECK-LABEL: name: load_constant
+# CHECK: registers:
+# CHECK: - { id: 0, class: sgpr }
+# CHECK: - { id: 1, class: sgpr }
+
+body: |
+ bb.0:
+ liveins: %sgpr0_sgpr1
+ %0:_(p2) = COPY %sgpr0_sgpr1
+ %1:_(s32) = G_LOAD %0 :: (load 4 from %ir.ptr0)
+...
+
+---
+name: load_global_uniform
+legalized: true
+
+# CHECK-LABEL: name: load_global_uniform
+# CHECK: registers:
+# CHECK: - { id: 0, class: sgpr }
+# CHECK: - { id: 1, class: sgpr }
+
+body: |
+ bb.0:
+ liveins: %sgpr0_sgpr1
+ %0:_(p1) = COPY %sgpr0_sgpr1
+ %1:_(s32) = G_LOAD %0 :: (load 4 from %ir.ptr1)
+...
+
+---
+name: load_global_non_uniform
+legalized: true
+
+# CHECK-LABEL: name: load_global_non_uniform
+# CHECK: registers:
+# CHECK: - { id: 0, class: sgpr }
+# CHECK: - { id: 1, class: vgpr }
+# CHECK: - { id: 2, class: vgpr }
+
+
+body: |
+ bb.0:
+ liveins: %sgpr0_sgpr1
+ %0:_(p1) = COPY %sgpr0_sgpr1
+ %1:_(s32) = G_LOAD %0 :: (load 4 from %ir.tmp1)
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/shader-epilogs.ll b/test/CodeGen/AMDGPU/GlobalISel/shader-epilogs.ll
new file mode 100644
index 000000000000..a1bf987e6552
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/shader-epilogs.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=GCN %s
+
+; REQUIRES: global-isel
+
+; GCN-LABEL: vs_epilog
+; GCN: s_endpgm
+
+define amdgpu_vs void @vs_epilog() {
+main_body:
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
new file mode 100644
index 000000000000..8a6b3df9cff8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
@@ -0,0 +1,89 @@
+; FIXME: Need to add support for mubuf stores to enable this on SI.
+; XUN: llc < %s -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=SIVI %s
+; RUN: llc < %s -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=CI --check-prefix=GCN %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=SIVI %s
+
+; REQUIRES: global-isel
+
+; SMRD load with an immediate offset.
+; GCN-LABEL: {{^}}smrd0:
+; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
+; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
+define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+entry:
+ %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
+ %1 = load i32, i32 addrspace(2)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; SMRD load with the largest possible immediate offset.
+; GCN-LABEL: {{^}}smrd1:
+; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
+; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
+define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+entry:
+ %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
+ %1 = load i32, i32 addrspace(2)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; SMRD load with an offset greater than the largest possible immediate.
+; GCN-LABEL: {{^}}smrd2:
+; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
+; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
+; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
+; GCN: s_endpgm
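+; (Element 256 is dword offset 0x100, beyond SI's 8-bit immediate, so SI
+; materializes the byte offset 0x400 in an SGPR; CI encodes 0x100 as a
+; 32-bit literal and VI uses its byte immediate directly.)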
+define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+entry:
+ %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256
+ %1 = load i32, i32 addrspace(2)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; SMRD load with a 64-bit offset
+; GCN-LABEL: {{^}}smrd3:
+; FIXME: There are too many copies here because we don't fold immediates
+; through REG_SEQUENCE
+; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
+; TODO: Add VI checks
+; XGCN: s_endpgm
+define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+entry:
+ %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
+ %1 = load i32, i32 addrspace(2)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; SMRD load with the largest possible immediate offset on VI
+; GCN-LABEL: {{^}}smrd4:
+; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc
+; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
+; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
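+; (262143 * 4 = 0xffffc, the largest value that still fits VI's 20-bit byte
+; immediate; SI has to take the offset through an SGPR either way.)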
+define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+entry:
+ %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143
+ %1 = load i32, i32 addrspace(2)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; SMRD load with an offset greater than the largest possible immediate on VI
+; GCN-LABEL: {{^}}smrd5:
+; SIVI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
+; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
+; GCN: s_endpgm
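+; (262144 * 4 = 0x100000 no longer fits VI's 20-bit byte immediate, so SI and
+; VI materialize the offset in an SGPR; CI can still use the 0x40000 dword
+; literal.)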
+define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+entry:
+ %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144
+ %1 = load i32, i32 addrspace(2)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/add-debug.ll b/test/CodeGen/AMDGPU/add-debug.ll
index 529905dd36a2..b90c20b97482 100644
--- a/test/CodeGen/AMDGPU/add-debug.ll
+++ b/test/CodeGen/AMDGPU/add-debug.ll
@@ -3,7 +3,7 @@
; REQUIRES: asserts
; Check that SelectionDAGDumper does not crash on int_SI_if.
-define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
entry:
%0 = icmp eq i64 %a, 0
br i1 %0, label %if, label %else
diff --git a/test/CodeGen/AMDGPU/add.i16.ll b/test/CodeGen/AMDGPU/add.i16.ll
index 6c5cdd3877d1..3b274c9d2027 100644
--- a/test/CodeGen/AMDGPU/add.i16.ll
+++ b/test/CodeGen/AMDGPU/add.i16.ll
@@ -6,7 +6,7 @@
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -23,7 +23,7 @@ define void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x7b, [[A]]
; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -38,7 +38,7 @@ define void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xfffffcb3, [[A]]
; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -53,7 +53,7 @@ define void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], -1, [[A]]
; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -69,7 +69,7 @@ define void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; VI-NEXT: buffer_store_dword [[ADD]]
-define void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -84,12 +84,12 @@ define void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64:
+; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
-; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -109,7 +109,7 @@ define void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: buffer_store_dword [[SEXT]]
-define void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -130,7 +130,7 @@ define void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
diff --git a/test/CodeGen/AMDGPU/add.ll b/test/CodeGen/AMDGPU/add.ll
index a6247c735240..7e4546d2cfb3 100644
--- a/test/CodeGen/AMDGPU/add.ll
+++ b/test/CodeGen/AMDGPU/add.ll
@@ -8,7 +8,7 @@
;SI: v_add_i32_e32 [[REG:v[0-9]+]], vcc, {{v[0-9]+, v[0-9]+}}
;SI-NOT: [[REG]]
;SI: buffer_store_dword [[REG]],
-define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load i32, i32 addrspace(1)* %in
%b = load i32, i32 addrspace(1)* %b_ptr
@@ -24,7 +24,7 @@ define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
%a = load <2 x i32>, <2 x i32> addrspace(1)* %in
%b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
@@ -44,7 +44,7 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32>, <4 x i32> addrspace(1)* %in
%b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
@@ -71,7 +71,7 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
; SI: s_add_i32
; SI: s_add_i32
; SI: s_add_i32
-define void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
+define amdgpu_kernel void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
entry:
%0 = add <8 x i32> %a, %b
store <8 x i32> %0, <8 x i32> addrspace(1)* %out
@@ -112,7 +112,7 @@ entry:
; SI: s_add_i32
; SI: s_add_i32
; SI: s_add_i32
-define void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
+define amdgpu_kernel void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
entry:
%0 = add <16 x i32> %a, %b
store <16 x i32> %0, <16 x i32> addrspace(1)* %out
@@ -129,7 +129,7 @@ entry:
; EG-DAG: ADD_INT
; EG-DAG: ADD_INT {{[* ]*}}
; EG-NOT: SUB
-define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
entry:
%0 = add i64 %a, %b
store i64 %0, i64 addrspace(1)* %out
@@ -150,7 +150,7 @@ entry:
; EG-DAG: ADD_INT
; EG-DAG: ADD_INT {{[* ]*}}
; EG-NOT: SUB
-define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
entry:
%0 = load i64, i64 addrspace(1)* %in
%1 = add i64 %a, %0
@@ -169,7 +169,7 @@ entry:
; EG-DAG: ADD_INT
; EG-DAG: ADD_INT {{[* ]*}}
; EG-NOT: SUB
-define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
entry:
%0 = icmp eq i64 %a, 0
br i1 %0, label %if, label %else
diff --git a/test/CodeGen/AMDGPU/add.v2i16.ll b/test/CodeGen/AMDGPU/add.v2i16.ll
new file mode 100644
index 000000000000..e137ef4bc236
--- /dev/null
+++ b/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -0,0 +1,283 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_add_v2i16:
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+ %add = add <2 x i16> %a, %b
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_add_v2i16:
+; GFX9: s_load_dword [[VAL0:s[0-9]+]]
+; GFX9: s_load_dword [[VAL1:s[0-9]+]]
+; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]]
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VVAL1]], [[VAL0]]
+
+; VI: s_add_i32
+; VI: s_add_i32
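+; (VI has no packed 16-bit scalar add, so the <2 x i16> value is presumably
+; split and handled with two 32-bit scalar adds; GFX9 uses the packed VALU op
+; on a copy of the scalar input.)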
+define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
+ %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
+ %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1
+ %add = add <2 x i16> %a, %b
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_add_self_v2i16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VAL]], [[VAL]]
+
+; VI: s_add_i32
+; VI: s_add_i32
+define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
+ %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
+ %add = add <2 x i16> %a, %a
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: VI should not scalarize arg access.
+; GCN-LABEL: {{^}}s_test_add_v2i16_kernarg:
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
+
+; VI: v_add_i32
+; VI: v_add_i32_sdwa
+define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
+ %add = add <2 x i16> %a, %b
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_add_v2i16_constant:
+; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
+
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x1c8, v{{[0-9]+}}
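+; (0x1c8007b packs the two halves of <123, 456>: 0x1c8 = 456 in the high 16
+; bits and 0x7b = 123 in the low 16 bits.)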
+define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %add = add <2 x i16> %a, <i16 123, i16 456>
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_add_v2i16_neg_constant:
+; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0xfc21fcb3{{$}}
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
+
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}}
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffc21, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %add = add <2 x i16> %a, <i16 -845, i16 -991>
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_add_v2i16_inline_neg1:
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, -1{{$}}
+
+; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
+; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD0]]
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]]
+; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_e32
+define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %add = add <2 x i16> %a, <i16 -1, i16 -1>
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_add_v2i16_inline_lo_zero_hi:
+; GFX9: s_mov_b32 [[K:s[0-9]+]], 32{{$}}
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
+
+; VI-NOT: v_add_u16
+; VI: v_add_u16_e32 v{{[0-9]+}}, 32, v{{[0-9]+}}
+; VI-NOT: v_add_u16
+; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_e32
+define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %add = add <2 x i16> %a, <i16 32, i16 0>
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; The high element (0x3f80) gives the packed constant the bit pattern of 1.0,
+; so it can be materialized with an inline fp constant.
+; GCN-LABEL: {{^}}v_test_add_v2i16_inline_fp_split:
+; GFX9: s_mov_b32 [[K:s[0-9]+]], 1.0
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
+
+; VI-NOT: v_add_u16
+; VI: v_add_u16_e32 v{{[0-9]+}}, 0x3f80, v{{[0-9]+}}
+; VI-NOT: v_add_u16
+; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_e32
+define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %add = add <2 x i16> %a, <i16 0, i16 16256>
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_add_v2i16_zext_to_v2i32:
+; GFX9: flat_load_dword [[A:v[0-9]+]]
+; GFX9: flat_load_dword [[B:v[0-9]+]]
+
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
+; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
+; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
+
+; VI: flat_load_ushort v[[A_HI:[0-9]+]]
+; VI: flat_load_ushort v[[A_LO:[0-9]+]]
+; VI: flat_load_ushort v[[B_HI:[0-9]+]]
+; VI: flat_load_ushort v[[B_LO:[0-9]+]]
+
+; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
+; VI-NOT: and
+; VI-NOT: shl
+; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
+; VI-NOT: and
+; VI-NOT: shl
+; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
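+; (The 16-bit adds already leave the high halves zero, so the zext to i32
+; needs no masking or shifting; hence the VI-NOT lines above.)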
+define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+ %add = add <2 x i16> %a, %b
+ %ext = zext <2 x i16> %add to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_add_v2i16_zext_to_v2i64:
+; GFX9: flat_load_dword [[A:v[0-9]+]]
+; GFX9: flat_load_dword [[B:v[0-9]+]]
+
+; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
+; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
+; GFX9: buffer_store_dwordx4
+
+; VI: flat_load_ushort v[[A_LO:[0-9]+]]
+; VI: flat_load_ushort v[[A_HI:[0-9]+]]
+; VI: flat_load_ushort v[[B_LO:[0-9]+]]
+; VI: flat_load_ushort v[[B_HI:[0-9]+]]
+
+; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; VI: v_add_u16_e32
+; VI: v_add_u16_e32
+
+; VI: buffer_store_dwordx4
+define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+ %add = add <2 x i16> %a, %b
+ %ext = zext <2 x i16> %add to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_add_v2i16_sext_to_v2i32:
+; GFX9: flat_load_dword [[A:v[0-9]+]]
+; GFX9: flat_load_dword [[B:v[0-9]+]]
+
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GFX9-DAG: v_bfe_i32 v[[ELT0:[0-9]+]], [[ADD]], 0, 16
+; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
+; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
+
+; VI: v_add_u16_e32
+; VI: v_add_u16_e32
+; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; VI: buffer_store_dwordx2
+define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+ %add = add <2 x i16> %a, %b
+ %ext = sext <2 x i16> %add to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_add_v2i16_sext_to_v2i64:
+; GCN: flat_load_dword
+; GCN: flat_load_dword
+
+; GFX9: v_pk_add_u16
+; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+
+; VI: v_add_u16_sdwa
+; VI: v_add_u16_e32
+
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+ %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+ %add = add <2 x i16> %a, %b
+ %ext = sext <2 x i16> %add to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/add_i128.ll b/test/CodeGen/AMDGPU/add_i128.ll
index c80157ca9c58..00a125c2e44f 100644
--- a/test/CodeGen/AMDGPU/add_i128.ll
+++ b/test/CodeGen/AMDGPU/add_i128.ll
@@ -6,7 +6,7 @@
; GCN-NEXT: v_addc_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc
; GCN-NEXT: v_addc_u32_e32 v[[HI:[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]],
-define void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %inA, i128 addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %inA, i128 addrspace(1)* noalias %inB) {
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%a_ptr = getelementptr i128, i128 addrspace(1)* %inA, i32 %tid
%b_ptr = getelementptr i128, i128 addrspace(1)* %inB, i32 %tid
@@ -23,7 +23,7 @@ define void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)*
; GCN: v_addc_u32
; GCN: v_addc_u32
; GCN: v_addc_u32
-define void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
+define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
%foo = load i128, i128 addrspace(1)* %in, align 8
%result = add i128 %foo, %a
store i128 %result, i128 addrspace(1)* %out
@@ -35,7 +35,7 @@ define void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* no
; GCN: v_addc_u32
; GCN: v_addc_u32
; GCN: v_addc_u32
-define void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
+define amdgpu_kernel void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
%foo = load i128, i128 addrspace(1)* %in, align 8
%result = add i128 %a, %foo
store i128 %result, i128 addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspa
; GCN: s_addc_u32
; GCN: s_addc_u32
; GCN: s_addc_u32
-define void @test_sreg(i128 addrspace(1)* noalias %out, i128 %a, i128 %b) {
+define amdgpu_kernel void @test_sreg(i128 addrspace(1)* noalias %out, i128 %a, i128 %b) {
%result = add i128 %a, %b
store i128 %result, i128 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/add_i64.ll b/test/CodeGen/AMDGPU/add_i64.ll
index 3d360b7d0b7a..62733d5bfb6c 100644
--- a/test/CodeGen/AMDGPU/add_i64.ll
+++ b/test/CodeGen/AMDGPU/add_i64.ll
@@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() readnone
; SI-LABEL: {{^}}test_i64_vreg:
; SI: v_add_i32
; SI: v_addc_u32
-define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
%b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
@@ -21,7 +21,7 @@ define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noa
; SI-LABEL: {{^}}sgpr_operand:
; SI: v_add_i32
; SI: v_addc_u32
-define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
+define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
%foo = load i64, i64 addrspace(1)* %in, align 8
%result = add i64 %foo, %a
store i64 %result, i64 addrspace(1)* %out
@@ -34,7 +34,7 @@ define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noal
; SI-LABEL: {{^}}sgpr_operand_reversed:
; SI: v_add_i32
; SI: v_addc_u32
-define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
+define amdgpu_kernel void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
%foo = load i64, i64 addrspace(1)* %in, align 8
%result = add i64 %a, %foo
store i64 %result, i64 addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace
; SI: s_addc_u32
; SI: s_add_u32
; SI: s_addc_u32
-define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) {
+define amdgpu_kernel void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) {
%result = add <2 x i64> %a, %b
store <2 x i64> %result, <2 x i64> addrspace(1)* %out
ret void
@@ -58,7 +58,7 @@ define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a,
; SI: v_addc_u32
; SI: v_add_i32
; SI: v_addc_u32
-define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
%b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
@@ -76,7 +76,7 @@ define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> add
; SI-NOT: addc
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
; SI: buffer_store_dword [[VRESULT]],
-define void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
%add = add i64 %b, %a
%trunc = trunc i64 %add to i32
store i32 %trunc, i32 addrspace(1)* %out, align 8
diff --git a/test/CodeGen/AMDGPU/addrspacecast-captured.ll b/test/CodeGen/AMDGPU/addrspacecast-captured.ll
new file mode 100644
index 000000000000..138bc36b9e1b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/addrspacecast-captured.ll
@@ -0,0 +1,47 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s
+
+; Nothing should be done if the addrspacecast is captured.
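+; A pointer that escapes (via ptrtoint, a store, or a call) can be
+; reinterpreted arbitrarily, so promote-alloca has to leave the alloca and
+; the cast untouched, as the checks below verify.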
+
+declare void @consume_ptr2int(i32) #0
+
+; CHECK-LABEL: @addrspacecast_captured(
+; CHECK: %data = alloca i32, align 4
+; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)*
+; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32
+; CHECK: store i32 %ptr2int, i32 addrspace(1)* %out
+define amdgpu_kernel void @addrspacecast_captured(i32 addrspace(1)* %out) #0 {
+entry:
+ %data = alloca i32, align 4
+ %cast = addrspacecast i32* %data to i32 addrspace(4)*
+ %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32
+ store i32 %ptr2int, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: @addrspacecast_captured_store(
+; CHECK: %data = alloca i32, align 4
+; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)*
+; CHECK: store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %out
+define amdgpu_kernel void @addrspacecast_captured_store(i32 addrspace(4)* addrspace(1)* %out) #0 {
+entry:
+ %data = alloca i32, align 4
+ %cast = addrspacecast i32* %data to i32 addrspace(4)*
+ store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: @addrspacecast_captured_call(
+; CHECK: %data = alloca i32, align 4
+; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)*
+; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32
+; CHECK: call void @consume_ptr2int(i32 %ptr2int)
+define amdgpu_kernel void @addrspacecast_captured_call() #0 {
+entry:
+ %data = alloca i32, align 4
+ %cast = addrspacecast i32* %data to i32 addrspace(4)*
+ %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32
+ call void @consume_ptr2int(i32 %ptr2int)
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
index 67a193999204..8cabc7dae133 100644
--- a/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
+++ b/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
@@ -9,57 +9,57 @@ declare void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* nocapture, i32 addrs
@global.arr = unnamed_addr addrspace(1) global [256 x i32] undef, align 4
; HSA: @store_cast_0_flat_to_group_addrspacecast() #1
-define void @store_cast_0_flat_to_group_addrspacecast() #1 {
+define amdgpu_kernel void @store_cast_0_flat_to_group_addrspacecast() #1 {
store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*)
ret void
}
; HSA: @store_cast_0_group_to_flat_addrspacecast() #2
-define void @store_cast_0_group_to_flat_addrspacecast() #1 {
+define amdgpu_kernel void @store_cast_0_group_to_flat_addrspacecast() #1 {
store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*)
ret void
}
-; HSA: define void @store_constant_cast_group_gv_to_flat() #2
-define void @store_constant_cast_group_gv_to_flat() #1 {
+; HSA: define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #2
+define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #1 {
store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds.i32 to i32 addrspace(4)*)
ret void
}
; HSA: @store_constant_cast_group_gv_gep_to_flat() #2
-define void @store_constant_cast_group_gv_gep_to_flat() #1 {
+define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat() #1 {
store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
ret void
}
; HSA: @store_constant_cast_global_gv_to_flat() #1
-define void @store_constant_cast_global_gv_to_flat() #1 {
+define amdgpu_kernel void @store_constant_cast_global_gv_to_flat() #1 {
store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global.i32 to i32 addrspace(4)*)
ret void
}
; HSA: @store_constant_cast_global_gv_gep_to_flat() #1
-define void @store_constant_cast_global_gv_gep_to_flat() #1 {
+define amdgpu_kernel void @store_constant_cast_global_gv_gep_to_flat() #1 {
store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(1)* @global.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
ret void
}
; HSA: @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
-define void @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
%val = load i32, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; HSA: @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
-define void @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
%val = atomicrmw add i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 1 seq_cst
store i32 %val, i32 addrspace(1)* %out
ret void
}
; HSA: @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
-define void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
%val = cmpxchg i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, i32 addrspace(1)* %out
@@ -67,28 +67,28 @@ define void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out)
}
; HSA: @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
-define void @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
call void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* %out, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 32, i32 4, i1 false)
ret void
}
; Can't just search the pointer value
; HSA: @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #2
-define void @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #1 {
+define amdgpu_kernel void @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #1 {
store i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 addrspace(4)* addrspace(1)* %out
ret void
}
; Can't just search pointer types
; HSA: @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #2
-define void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #1 {
store i64 ptrtoint (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i64), i64 addrspace(1)* %out
ret void
}
; Cast group to flat, do GEP, cast back to group
; HSA: @store_constant_cast_group_gv_gep_to_flat_to_group() #2
-define void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 {
+define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 {
store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*)
ret void
}
diff --git a/test/CodeGen/AMDGPU/addrspacecast.ll b/test/CodeGen/AMDGPU/addrspacecast.ll
index 0a2130c96add..6ec93c72ec52 100644
--- a/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -1,14 +1,23 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA -check-prefix=GFX9 %s
; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
-; HSA: enable_sgpr_queue_ptr = 1
+; CI: enable_sgpr_queue_ptr = 1
+; GFX9: enable_sgpr_queue_ptr = 0
-; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
-; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
+; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
+; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
+; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+
+; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
+; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(15, 16, 16)
+; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
+; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]
+
+; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
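+; (hwreg 15 is presumably SH_MEM_BASES: the shared-aperture base lives in its
+; upper 16 bits, so shifting the field left by 16 reconstructs the high half
+; of the flat address without reading the queue pointer.)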
-; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
@@ -17,7 +26,13 @@
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
-define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
+
+; At most 2 digits. Make sure src_shared_base is not counted as a
+; high-numbered SGPR.
+
+; CI: NumSgprs: {{[0-9][0-9]+}}
+; GFX9: NumSgprs: {{[0-9]+}}
+define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
%stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
store volatile i32 7, i32 addrspace(4)* %stof
ret void
@@ -26,21 +41,32 @@ define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
-; HSA: enable_sgpr_queue_ptr = 1
+; CI: enable_sgpr_queue_ptr = 1
+; GFX9: enable_sgpr_queue_ptr = 0
-; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
-; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
+; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
+; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
+; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+
+; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
+; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(15, 0, 16)
+; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16
+; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_PRIVATE_BASE]]
+
+; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base
-; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
+; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], 0
; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
-define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
+
+; CI: NumSgprs: {{[0-9][0-9]+}}
+; GFX9: NumSgprs: {{[0-9]+}}
+define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
%stof = addrspacecast i32* %ptr to i32 addrspace(4)*
store volatile i32 7, i32 addrspace(4)* %stof
ret void
@@ -55,7 +81,7 @@ define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
-define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
%stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
store volatile i32 7, i32 addrspace(4)* %stof
ret void
@@ -67,7 +93,7 @@ define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
-define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
+define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
%stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
%ld = load volatile i32, i32 addrspace(4)* %stof
ret void
@@ -84,7 +110,7 @@ define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
-define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
store volatile i32 0, i32 addrspace(3)* %ftos
ret void
@@ -98,10 +124,10 @@ define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
; HSA-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
-; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
+; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], 0, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
-define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
store volatile i32 0, i32* %ftos
ret void
@@ -115,7 +141,7 @@ define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
-define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
store volatile i32 0, i32 addrspace(1)* %ftos
ret void
@@ -126,21 +152,27 @@ define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
-define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
load volatile i32, i32 addrspace(2)* %ftos
ret void
}
; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
-; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
-; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
+; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
+; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
+; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(15, 16, 16)
+; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
+; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SSRC_SHARED_BASE]]
+
+; GFX9-XXX: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base
+
; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
-define void @cast_0_group_to_flat_addrspacecast() #0 {
+define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 {
%cast = addrspacecast i32 addrspace(3)* null to i32 addrspace(4)*
- store i32 7, i32 addrspace(4)* %cast
+ store volatile i32 7, i32 addrspace(4)* %cast
ret void
}
@@ -148,9 +180,9 @@ define void @cast_0_group_to_flat_addrspacecast() #0 {
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
-define void @cast_0_flat_to_group_addrspacecast() #0 {
+define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 {
%cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(3)*
- store i32 7, i32 addrspace(3)* %cast
+ store volatile i32 7, i32 addrspace(3)* %cast
ret void
}
@@ -159,9 +191,9 @@ define void @cast_0_flat_to_group_addrspacecast() #0 {
; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
-define void @cast_neg1_group_to_flat_addrspacecast() #0 {
+define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 {
%cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32 addrspace(4)*
- store i32 7, i32 addrspace(4)* %cast
+ store volatile i32 7, i32 addrspace(4)* %cast
ret void
}
@@ -169,31 +201,34 @@ define void @cast_neg1_group_to_flat_addrspacecast() #0 {
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
-define void @cast_neg1_flat_to_group_addrspacecast() #0 {
+define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
%cast = addrspacecast i32 addrspace(4)* inttoptr (i64 -1 to i32 addrspace(4)*) to i32 addrspace(3)*
- store i32 7, i32 addrspace(3)* %cast
+ store volatile i32 7, i32 addrspace(3)* %cast
ret void
}
+; FIXME: Shouldn't need to enable queue ptr
; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
-; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11
-; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
+; CI: enable_sgpr_queue_ptr = 1
+; GFX9: enable_sgpr_queue_ptr = 0
+
; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
+; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
-define void @cast_0_private_to_flat_addrspacecast() #0 {
+define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
%cast = addrspacecast i32* null to i32 addrspace(4)*
- store i32 7, i32 addrspace(4)* %cast
+ store volatile i32 7, i32 addrspace(4)* %cast
ret void
}
; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast:
-; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
+; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
-define void @cast_0_flat_to_private_addrspacecast() #0 {
+define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
%cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(0)*
- store i32 7, i32* %cast
+ store volatile i32 7, i32* %cast
ret void
}
@@ -203,7 +238,7 @@ define void @cast_0_flat_to_private_addrspacecast() #0 {
; HSA-LABEL: {{^}}branch_use_flat_i32:
; HSA: flat_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}
; HSA: s_endpgm
-define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
+define amdgpu_kernel void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
entry:
%cmp = icmp ne i32 %c, 0
br i1 %cmp, label %local, label %global
@@ -218,7 +253,7 @@ global:
end:
%fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ]
- store i32 %x, i32 addrspace(4)* %fptr, align 4
+ store volatile i32 %x, i32 addrspace(4)* %fptr, align 4
; %val = load i32, i32 addrspace(4)* %fptr, align 4
; store i32 %val, i32 addrspace(1)* %out, align 4
ret void
@@ -226,22 +261,26 @@ end:
; Check for prologue initializing special SGPRs pointing to scratch.
; HSA-LABEL: {{^}}store_flat_scratch:
-; HSA-DAG: s_mov_b32 flat_scratch_lo, s9
-; HSA-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11
-; HSA: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
+; CI-DAG: s_mov_b32 flat_scratch_lo, s9
+; CI-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11
+; CI: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
+
+; GFX9: s_add_u32 flat_scratch_lo, s6, s9
+; GFX9: s_addc_u32 flat_scratch_hi, s7, 0
+
; HSA: flat_store_dword
; HSA: s_barrier
; HSA: flat_load_dword
-define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
+define amdgpu_kernel void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
%alloca = alloca i32, i32 9, align 4
%x = call i32 @llvm.amdgcn.workitem.id.x() #2
%pptr = getelementptr i32, i32* %alloca, i32 %x
%fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
- store i32 %x, i32 addrspace(4)* %fptr
+ store volatile i32 %x, i32 addrspace(4)* %fptr
; Dummy call
call void @llvm.amdgcn.s.barrier() #1
- %reload = load i32, i32 addrspace(4)* %fptr, align 4
- store i32 %reload, i32 addrspace(1)* %out, align 4
+ %reload = load volatile i32, i32 addrspace(4)* %fptr, align 4
+ store volatile i32 %reload, i32 addrspace(1)* %out, align 4
ret void
}
diff --git a/test/CodeGen/AMDGPU/amdgcn.bitcast.ll b/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
index 87ef5978ebfc..ef742f56faec 100644
--- a/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
+++ b/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
@@ -3,24 +3,20 @@
; This test just checks that the compiler doesn't crash.
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
; FUNC-LABEL: {{^}}v32i8_to_v8i32:
-; SI: s_endpgm
-define amdgpu_ps void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 {
+define amdgpu_ps float @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 {
entry:
%1 = load <32 x i8>, <32 x i8> addrspace(2)* %0
%2 = bitcast <32 x i8> %1 to <8 x i32>
%3 = extractelement <8 x i32> %2, i32 1
%4 = icmp ne i32 %3, 0
%5 = select i1 %4, float 0.0, float 1.0
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5)
- ret void
+ ret float %5
}
; FUNC-LABEL: {{^}}i8ptr_v16i8ptr:
; SI: s_endpgm
-define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)*
%1 = load <16 x i8>, <16 x i8> addrspace(1)* %0
@@ -28,28 +24,50 @@ entry:
ret void
}
-define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
%load = load float, float addrspace(1)* %in, align 4
- %bc = bitcast float %load to <2 x i16>
- store <2 x i16> %bc, <2 x i16> addrspace(1)* %out, align 4
+ %fadd32 = fadd float %load, 1.0
+ %bc = bitcast float %fadd32 to <2 x i16>
+ %add.bitcast = add <2 x i16> %bc, <i16 2, i16 2>
+ store <2 x i16> %add.bitcast, <2 x i16> addrspace(1)* %out
ret void
}
-define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
%load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
- %bc = bitcast <2 x i16> %load to float
- store float %bc, float addrspace(1)* %out, align 4
+ %add.v2i16 = add <2 x i16> %load, <i16 2, i16 2>
+ %bc = bitcast <2 x i16> %add.v2i16 to float
+ %fadd.bitcast = fadd float %bc, 1.0
+ store float %fadd.bitcast, float addrspace(1)* %out
+ ret void
+}
+
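+; Same pattern as above for half: arithmetic on both sides of the bitcast
+; keeps the conversion from being folded away before it is code-generated.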
+define amdgpu_kernel void @f32_to_v2f16(<2 x half> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+ %load = load float, float addrspace(1)* %in, align 4
+ %fadd32 = fadd float %load, 1.0
+ %bc = bitcast float %fadd32 to <2 x half>
+ %add.bitcast = fadd <2 x half> %bc, <half 2.0, half 2.0>
+ store <2 x half> %add.bitcast, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @v2f16_to_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %in) nounwind {
+ %load = load <2 x half>, <2 x half> addrspace(1)* %in, align 4
+ %add.v2f16 = fadd <2 x half> %load, <half 2.0, half 2.0>
+ %bc = bitcast <2 x half> %add.v2f16 to float
+ %fadd.bitcast = fadd float %bc, 1.0
+ store float %fadd.bitcast, float addrspace(1)* %out
ret void
}
-define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
%bc = bitcast <4 x i8> %load to i32
store i32 %bc, i32 addrspace(1)* %out, align 4
ret void
}
-define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
%load = load i32, i32 addrspace(1)* %in, align 4
%bc = bitcast i32 %load to <4 x i8>
store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4
@@ -58,17 +76,18 @@ define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nou
; FUNC-LABEL: {{^}}bitcast_v2i32_to_f64:
; SI: s_endpgm
-define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
%add = add <2 x i32> %val, <i32 4, i32 9>
%bc = bitcast <2 x i32> %add to double
- store double %bc, double addrspace(1)* %out, align 8
+ %fadd.bc = fadd double %bc, 1.0
+ store double %fadd.bc, double addrspace(1)* %out, align 8
ret void
}
; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32:
; SI: s_endpgm
-define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
%val = load double, double addrspace(1)* %in, align 8
%add = fadd double %val, 4.0
%bc = bitcast double %add to <2 x i32>
@@ -77,7 +96,7 @@ define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace
}
; FUNC-LABEL: {{^}}bitcast_v2i64_to_v2f64:
-define void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
+define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -93,7 +112,7 @@ end:
}
; FUNC-LABEL: {{^}}bitcast_v2f64_to_v2i64:
-define void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
+define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
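
The adds and fadds wrapped around each bitcast above keep the value live in both type domains, presumably so the conversion survives instruction selection instead of folding into the adjacent load or store. A self-contained sketch of the pattern (hypothetical name):

define amdgpu_kernel void @bitcast_both_uses_sketch(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
  %v = load <2 x i16>, <2 x i16> addrspace(1)* %in
  %v.add = add <2 x i16> %v, <i16 2, i16 2>   ; integer-typed use before the bitcast
  %f = bitcast <2 x i16> %v.add to float
  %f.add = fadd float %f, 1.0                 ; float-typed use after the bitcast
  store float %f.add, float addrspace(1)* %out
  ret void
}
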
diff --git a/test/CodeGen/AMDGPU/amdgcn.private-memory.ll b/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
index a6d055891d4b..79450b97c218 100644
--- a/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
+++ b/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
@@ -15,7 +15,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; GCN-NOT: v0
; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, v0, v{{[0-9]+}}
; GCN: buffer_store_dword [[RESULT]]
-define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
entry:
%0 = alloca [2 x i32]
%1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0
diff --git a/test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll b/test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll
deleted file mode 100644
index 8d8885852afe..000000000000
--- a/test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
-
-; GCN-LABEL: {{^}}main:
-; GCN: s_mov_b32 m0, s0
-; VI-NEXT: s_nop 0
-; GCN-NEXT: sendmsg(MSG_GS_DONE, GS_OP_NOP)
-; GCN-NEXT: s_endpgm
-
-define amdgpu_gs void @main(i32 inreg %a) #0 {
- call void @llvm.amdgcn.s.sendmsg(i32 3, i32 %a)
- ret void
-}
-
-; GCN-LABEL: {{^}}main_halt:
-; GCN: s_mov_b32 m0, s0
-; VI-NEXT: s_nop 0
-; GCN-NEXT: s_sendmsghalt sendmsg(MSG_INTERRUPT)
-; GCN-NEXT: s_endpgm
-
-define void @main_halt(i32 inreg %a) #0 {
- call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 %a)
- ret void
-}
-
-; GCN-LABEL: {{^}}legacy:
-; GCN: s_mov_b32 m0, s0
-; VI-NEXT: s_nop 0
-; GCN-NEXT: sendmsg(MSG_GS_DONE, GS_OP_NOP)
-; GCN-NEXT: s_endpgm
-
-define amdgpu_gs void @legacy(i32 inreg %a) #0 {
- call void @llvm.SI.sendmsg(i32 3, i32 %a)
- ret void
-}
-
-declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0
-declare void @llvm.amdgcn.s.sendmsghalt(i32, i32) #0
-declare void @llvm.SI.sendmsg(i32, i32) #0
-
-attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/amdgcn.sendmsg.ll b/test/CodeGen/AMDGPU/amdgcn.sendmsg.ll
deleted file mode 100644
index 31f9cfca6def..000000000000
--- a/test/CodeGen/AMDGPU/amdgcn.sendmsg.ll
+++ /dev/null
@@ -1,161 +0,0 @@
-;RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
-;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
-
-; CHECK-LABEL: {{^}}test_interrupt:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg sendmsg(MSG_INTERRUPT)
-define void @test_interrupt() {
-body:
- call void @llvm.amdgcn.s.sendmsg(i32 1, i32 0);
- ret void
-}
-
-; CHECK-LABEL: {{^}}test_gs_emit:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0)
-define void @test_gs_emit() {
-body:
- call void @llvm.amdgcn.s.sendmsg(i32 34, i32 0);
- ret void
-}
-
-; CHECK-LABEL: {{^}}test_gs_cut:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1)
-define void @test_gs_cut() {
-body:
- call void @llvm.amdgcn.s.sendmsg(i32 274, i32 0);
- ret void
-}
-
-; CHECK-LABEL: {{^}}test_gs_emit_cut:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
-define void @test_gs_emit_cut() {
-body:
- call void @llvm.amdgcn.s.sendmsg(i32 562, i32 0)
- ret void
-}
-
-; CHECK-LABEL: {{^}}test_gs_done:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
-define void @test_gs_done() {
-body:
- call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
- ret void
-}
-
-
-; CHECK-LABEL: {{^}}test_interrupt_halt:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsghalt sendmsg(MSG_INTERRUPT)
-define void @test_interrupt_halt() {
-body:
- call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 0)
- ret void
-}
-
-; CHECK-LABEL: {{^}}test_gs_emit_halt:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT, 0)
-define void @test_gs_emit_halt() {
-body:
- call void @llvm.amdgcn.s.sendmsghalt(i32 34, i32 0)
- ret void
-}
-
-; CHECK-LABEL: {{^}}test_gs_cut_halt:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsghalt sendmsg(MSG_GS, GS_OP_CUT, 1)
-define void @test_gs_cut_halt() {
-body:
- call void @llvm.amdgcn.s.sendmsghalt(i32 274, i32 0)
- ret void
-}
-
-; CHECK-LABEL: {{^}}test_gs_emit_cut_halt:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
-define void @test_gs_emit_cut_halt() {
-body:
- call void @llvm.amdgcn.s.sendmsghalt(i32 562, i32 0)
- ret void
-}
-
-; CHECK-LABEL: {{^}}test_gs_done_halt:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsghalt sendmsg(MSG_GS_DONE, GS_OP_NOP)
-define void @test_gs_done_halt() {
-body:
- call void @llvm.amdgcn.s.sendmsghalt(i32 3, i32 0)
- ret void
-}
-
-; Legacy
-; CHECK-LABEL: {{^}}test_legacy_interrupt:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg sendmsg(MSG_INTERRUPT)
-define void @test_legacy_interrupt() {
-body:
- call void @llvm.SI.sendmsg(i32 1, i32 0)
- ret void
-}
-
-; CHECK-LABEL: {{^}}test_legacy_gs_emit:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0)
-define void @test_legacy_gs_emit() {
-body:
- call void @llvm.SI.sendmsg(i32 34, i32 0)
- ret void
-}
-
-; CHECK-LABEL: {{^}}test_legacy_gs_cut:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1)
-define void @test_legacy_gs_cut() {
-body:
- call void @llvm.SI.sendmsg(i32 274, i32 0)
- ret void
-}
-
-; CHECK-LABEL: {{^}}test_legacy_gs_emit_cut:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
-define void @test_legacy_gs_emit_cut() {
-body:
- call void @llvm.SI.sendmsg(i32 562, i32 0)
- ret void
-}
-
-; CHECK-LABEL: {{^}}test_legacy_gs_done:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
-define void @test_legacy_gs_done() {
-body:
- call void @llvm.SI.sendmsg(i32 3, i32 0)
- ret void
-}
-
-; Function Attrs: nounwind
-declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0
-declare void @llvm.amdgcn.s.sendmsghalt(i32, i32) #0
-declare void @llvm.SI.sendmsg(i32, i32) #0
-
-attributes #0 = { nounwind }
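
For readers decoding the deleted tests: the immediate operands follow the SI/VI sendmsg encoding, with the message ID in bits [3:0], the operation in bits [6:4], and the stream ID in bits [9:8]. A sketch of the mapping, reconstructed from the CHECK mnemonics above:

;   1   -> MSG_INTERRUPT (1)
;   34  -> MSG_GS (2) | GS_OP_EMIT (2) << 4
;   274 -> MSG_GS (2) | GS_OP_CUT (1) << 4      | stream 1 << 8
;   562 -> MSG_GS (2) | GS_OP_EMIT_CUT (3) << 4 | stream 2 << 8
;   3   -> MSG_GS_DONE (3) | GS_OP_NOP (0) << 4
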
diff --git a/test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll b/test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll
new file mode 100644
index 000000000000..e68ed9cac93f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll
@@ -0,0 +1,9 @@
+; RUN: opt -mtriple=amdgcn-- -O3 -aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple=r600-- -O3 -aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s
+
+; CHECK: NoAlias: i8 addrspace(1)* %p1, i8* %p
+
+define void @test(i8* %p, i8 addrspace(1)* %p1) {
+ ret void
+}
+
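The new test only checks that pointers in provably disjoint address spaces are reported NoAlias. As a hedged companion sketch: two pointers in the same address space carry no such distinguishing information, so aa-eval should report them MayAlias (hypothetical function and value names):

; CHECK: MayAlias: i8 addrspace(1)* %q0, i8 addrspace(1)* %q1
define void @same_addrspace_sketch(i8 addrspace(1)* %q0, i8 addrspace(1)* %q1) {
  ret void
}
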
diff --git a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
index d78c75165be2..0e5605961e10 100644
--- a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
+++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
@@ -4,7 +4,7 @@
; NOOP-LABEL: @noop_fdiv_fpmath(
; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0
-define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
+define amdgpu_kernel void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
%md.25ulp = fdiv float %a, %b, !fpmath !0
store volatile float %md.25ulp, float addrspace(1)* %out
ret void
@@ -18,7 +18,7 @@ define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3
; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
; CHECK: %arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
-define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
+define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
%no.md = fdiv float %a, %b
store volatile float %no.md, float addrspace(1)* %out
@@ -51,7 +51,7 @@ define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0
; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}}
; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0
-define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
+define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
%no.md = fdiv float 1.0, %x
store volatile float %no.md, float addrspace(1)* %out
@@ -89,7 +89,7 @@ define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0
; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1
-define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
+define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
%no.md = fdiv <2 x float> %a, %b
store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
@@ -120,7 +120,7 @@ define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a,
; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
-define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
+define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
%no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x
store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
@@ -158,7 +158,7 @@ define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float>
; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0
; CHECK: store volatile <2 x float> %fast.25ulp
-define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
+define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
%no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
@@ -186,7 +186,7 @@ define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2
; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
; CHECK: store volatile <2 x float> %fast.25ulp
-define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
+define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
%x.insert = insertelement <2 x float> %x, float 1.0, i32 0
%arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0
@@ -206,7 +206,7 @@ define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %
; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
-define void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
+define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
%no.md = fdiv float %a, %b
store volatile float %no.md, float addrspace(1)* %out
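
The !fpmath nodes referenced by these CHECK lines bound the permitted error in ULPs, and 2.5 ulp (!0 in this file, judging by the check names) is the threshold at which the pass may emit llvm.amdgcn.fdiv.fast. A self-contained sketch, assuming that metadata value:

define amdgpu_kernel void @fdiv_25ulp_sketch(float addrspace(1)* %out, float %a, float %b) {
  %d = fdiv float %a, %b, !fpmath !0   ; eligible for the 2.5 ulp fast lowering
  store volatile float %d, float addrspace(1)* %out
  ret void
}
!0 = !{float 2.500000e+00}
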
diff --git a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
index 13e4192ccd72..95a206e1dd00 100644
--- a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
+++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
@@ -6,7 +6,7 @@
; SI-NEXT: ret i3 %r
; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = add i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
; VI-NEXT: ret i3 %[[R_3]]
define i3 @add_i3(i3 %a, i3 %b) {
@@ -19,7 +19,7 @@ define i3 @add_i3(i3 %a, i3 %b) {
; SI-NEXT: ret i3 %r
; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = add nsw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
; VI-NEXT: ret i3 %[[R_3]]
define i3 @add_nsw_i3(i3 %a, i3 %b) {
@@ -32,7 +32,7 @@ define i3 @add_nsw_i3(i3 %a, i3 %b) {
; SI-NEXT: ret i3 %r
; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = add nuw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
; VI-NEXT: ret i3 %[[R_3]]
define i3 @add_nuw_i3(i3 %a, i3 %b) {
@@ -58,7 +58,7 @@ define i3 @add_nuw_nsw_i3(i3 %a, i3 %b) {
; SI-NEXT: ret i3 %r
; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = sub i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
; VI-NEXT: ret i3 %[[R_3]]
define i3 @sub_i3(i3 %a, i3 %b) {
@@ -84,7 +84,7 @@ define i3 @sub_nsw_i3(i3 %a, i3 %b) {
; SI-NEXT: ret i3 %r
; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
; VI-NEXT: ret i3 %[[R_3]]
define i3 @sub_nuw_i3(i3 %a, i3 %b) {
@@ -110,7 +110,7 @@ define i3 @sub_nuw_nsw_i3(i3 %a, i3 %b) {
; SI-NEXT: ret i3 %r
; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = mul i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
; VI-NEXT: ret i3 %[[R_3]]
define i3 @mul_i3(i3 %a, i3 %b) {
@@ -123,7 +123,7 @@ define i3 @mul_i3(i3 %a, i3 %b) {
; SI-NEXT: ret i3 %r
; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = mul nsw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
; VI-NEXT: ret i3 %[[R_3]]
define i3 @mul_nsw_i3(i3 %a, i3 %b) {
@@ -136,7 +136,7 @@ define i3 @mul_nsw_i3(i3 %a, i3 %b) {
; SI-NEXT: ret i3 %r
; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
; VI-NEXT: ret i3 %[[R_3]]
define i3 @mul_nuw_i3(i3 %a, i3 %b) {
@@ -188,7 +188,7 @@ define i3 @srem_i3(i3 %a, i3 %b) {
; SI-NEXT: ret i3 %r
; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = shl i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
; VI-NEXT: ret i3 %[[R_3]]
define i3 @shl_i3(i3 %a, i3 %b) {
@@ -201,7 +201,7 @@ define i3 @shl_i3(i3 %a, i3 %b) {
; SI-NEXT: ret i3 %r
; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = shl nsw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
; VI-NEXT: ret i3 %[[R_3]]
define i3 @shl_nsw_i3(i3 %a, i3 %b) {
@@ -214,7 +214,7 @@ define i3 @shl_nsw_i3(i3 %a, i3 %b) {
; SI-NEXT: ret i3 %r
; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
; VI-NEXT: ret i3 %[[R_3]]
define i3 @shl_nuw_i3(i3 %a, i3 %b) {
@@ -525,7 +525,7 @@ define i3 @bitreverse_i3(i3 %a) {
; SI-NEXT: ret i16 %r
; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = add i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
; VI-NEXT: ret i16 %[[R_16]]
define i16 @add_i16(i16 %a, i16 %b) {
@@ -559,7 +559,7 @@ define i16 @constant_add_nuw_i16() {
; SI-NEXT: ret i16 %r
; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = add nsw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
; VI-NEXT: ret i16 %[[R_16]]
define i16 @add_nsw_i16(i16 %a, i16 %b) {
@@ -572,7 +572,7 @@ define i16 @add_nsw_i16(i16 %a, i16 %b) {
; SI-NEXT: ret i16 %r
; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = add nuw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
; VI-NEXT: ret i16 %[[R_16]]
define i16 @add_nuw_i16(i16 %a, i16 %b) {
@@ -598,7 +598,7 @@ define i16 @add_nuw_nsw_i16(i16 %a, i16 %b) {
; SI-NEXT: ret i16 %r
; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = sub i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
; VI-NEXT: ret i16 %[[R_16]]
define i16 @sub_i16(i16 %a, i16 %b) {
@@ -624,7 +624,7 @@ define i16 @sub_nsw_i16(i16 %a, i16 %b) {
; SI-NEXT: ret i16 %r
; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
; VI-NEXT: ret i16 %[[R_16]]
define i16 @sub_nuw_i16(i16 %a, i16 %b) {
@@ -650,7 +650,7 @@ define i16 @sub_nuw_nsw_i16(i16 %a, i16 %b) {
; SI-NEXT: ret i16 %r
; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = mul i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
; VI-NEXT: ret i16 %[[R_16]]
define i16 @mul_i16(i16 %a, i16 %b) {
@@ -663,7 +663,7 @@ define i16 @mul_i16(i16 %a, i16 %b) {
; SI-NEXT: ret i16 %r
; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = mul nsw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
; VI-NEXT: ret i16 %[[R_16]]
define i16 @mul_nsw_i16(i16 %a, i16 %b) {
@@ -676,7 +676,7 @@ define i16 @mul_nsw_i16(i16 %a, i16 %b) {
; SI-NEXT: ret i16 %r
; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
; VI-NEXT: ret i16 %[[R_16]]
define i16 @mul_nuw_i16(i16 %a, i16 %b) {
@@ -728,7 +728,7 @@ define i16 @srem_i16(i16 %a, i16 %b) {
; SI-NEXT: ret i16 %r
; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = shl i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
; VI-NEXT: ret i16 %[[R_16]]
define i16 @shl_i16(i16 %a, i16 %b) {
@@ -741,7 +741,7 @@ define i16 @shl_i16(i16 %a, i16 %b) {
; SI-NEXT: ret i16 %r
; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = shl nsw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
; VI-NEXT: ret i16 %[[R_16]]
define i16 @shl_nsw_i16(i16 %a, i16 %b) {
@@ -754,7 +754,7 @@ define i16 @shl_nsw_i16(i16 %a, i16 %b) {
; SI-NEXT: ret i16 %r
; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
; VI-NEXT: ret i16 %[[R_16]]
define i16 @shl_nuw_i16(i16 %a, i16 %b) {
@@ -1072,7 +1072,7 @@ define i16 @bitreverse_i16(i16 %a) {
; SI-NEXT: ret <3 x i15> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = add <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
; VI-NEXT: ret <3 x i15> %[[R_15]]
define <3 x i15> @add_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1085,7 +1085,7 @@ define <3 x i15> @add_3xi15(<3 x i15> %a, <3 x i15> %b) {
; SI-NEXT: ret <3 x i15> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = add nsw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
; VI-NEXT: ret <3 x i15> %[[R_15]]
define <3 x i15> @add_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1098,7 +1098,7 @@ define <3 x i15> @add_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
; SI-NEXT: ret <3 x i15> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = add nuw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
; VI-NEXT: ret <3 x i15> %[[R_15]]
define <3 x i15> @add_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1124,7 +1124,7 @@ define <3 x i15> @add_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
; SI-NEXT: ret <3 x i15> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = sub <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
; VI-NEXT: ret <3 x i15> %[[R_15]]
define <3 x i15> @sub_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1150,7 +1150,7 @@ define <3 x i15> @sub_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
; SI-NEXT: ret <3 x i15> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
; VI-NEXT: ret <3 x i15> %[[R_15]]
define <3 x i15> @sub_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1176,7 +1176,7 @@ define <3 x i15> @sub_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
; SI-NEXT: ret <3 x i15> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = mul <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
; VI-NEXT: ret <3 x i15> %[[R_15]]
define <3 x i15> @mul_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1189,7 +1189,7 @@ define <3 x i15> @mul_3xi15(<3 x i15> %a, <3 x i15> %b) {
; SI-NEXT: ret <3 x i15> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = mul nsw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
; VI-NEXT: ret <3 x i15> %[[R_15]]
define <3 x i15> @mul_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1202,7 +1202,7 @@ define <3 x i15> @mul_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
; SI-NEXT: ret <3 x i15> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
; VI-NEXT: ret <3 x i15> %[[R_15]]
define <3 x i15> @mul_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1254,7 +1254,7 @@ define <3 x i15> @srem_3xi15(<3 x i15> %a, <3 x i15> %b) {
; SI-NEXT: ret <3 x i15> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = shl <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
; VI-NEXT: ret <3 x i15> %[[R_15]]
define <3 x i15> @shl_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1267,7 +1267,7 @@ define <3 x i15> @shl_3xi15(<3 x i15> %a, <3 x i15> %b) {
; SI-NEXT: ret <3 x i15> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = shl nsw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
; VI-NEXT: ret <3 x i15> %[[R_15]]
define <3 x i15> @shl_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1280,7 +1280,7 @@ define <3 x i15> @shl_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
; SI-NEXT: ret <3 x i15> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
; VI-NEXT: ret <3 x i15> %[[R_15]]
define <3 x i15> @shl_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1591,7 +1591,7 @@ define <3 x i15> @bitreverse_3xi15(<3 x i15> %a) {
; SI-NEXT: ret <3 x i16> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = add <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
; VI-NEXT: ret <3 x i16> %[[R_16]]
define <3 x i16> @add_3xi16(<3 x i16> %a, <3 x i16> %b) {
@@ -1604,7 +1604,7 @@ define <3 x i16> @add_3xi16(<3 x i16> %a, <3 x i16> %b) {
; SI-NEXT: ret <3 x i16> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = add nsw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
; VI-NEXT: ret <3 x i16> %[[R_16]]
define <3 x i16> @add_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
@@ -1617,7 +1617,7 @@ define <3 x i16> @add_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
; SI-NEXT: ret <3 x i16> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = add nuw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
; VI-NEXT: ret <3 x i16> %[[R_16]]
define <3 x i16> @add_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
@@ -1643,7 +1643,7 @@ define <3 x i16> @add_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
; SI-NEXT: ret <3 x i16> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = sub <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
; VI-NEXT: ret <3 x i16> %[[R_16]]
define <3 x i16> @sub_3xi16(<3 x i16> %a, <3 x i16> %b) {
@@ -1669,7 +1669,7 @@ define <3 x i16> @sub_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
; SI-NEXT: ret <3 x i16> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
; VI-NEXT: ret <3 x i16> %[[R_16]]
define <3 x i16> @sub_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
@@ -1695,7 +1695,7 @@ define <3 x i16> @sub_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
; SI-NEXT: ret <3 x i16> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = mul <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
; VI-NEXT: ret <3 x i16> %[[R_16]]
define <3 x i16> @mul_3xi16(<3 x i16> %a, <3 x i16> %b) {
@@ -1708,7 +1708,7 @@ define <3 x i16> @mul_3xi16(<3 x i16> %a, <3 x i16> %b) {
; SI-NEXT: ret <3 x i16> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = mul nsw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
; VI-NEXT: ret <3 x i16> %[[R_16]]
define <3 x i16> @mul_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
@@ -1721,7 +1721,7 @@ define <3 x i16> @mul_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
; SI-NEXT: ret <3 x i16> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
; VI-NEXT: ret <3 x i16> %[[R_16]]
define <3 x i16> @mul_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
@@ -1773,7 +1773,7 @@ define <3 x i16> @srem_3xi16(<3 x i16> %a, <3 x i16> %b) {
; SI-NEXT: ret <3 x i16> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = shl <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
; VI-NEXT: ret <3 x i16> %[[R_16]]
define <3 x i16> @shl_3xi16(<3 x i16> %a, <3 x i16> %b) {
@@ -1786,7 +1786,7 @@ define <3 x i16> @shl_3xi16(<3 x i16> %a, <3 x i16> %b) {
; SI-NEXT: ret <3 x i16> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = shl nsw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
; VI-NEXT: ret <3 x i16> %[[R_16]]
define <3 x i16> @shl_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
@@ -1799,7 +1799,7 @@ define <3 x i16> @shl_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
; SI-NEXT: ret <3 x i16> %r
; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
; VI-NEXT: ret <3 x i16> %[[R_16]]
define <3 x i16> @shl_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
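
The flag inference throughout this file follows from simple range arithmetic on the widened operands: zero-extended iN inputs are at most 2^N - 1, so for i16 an add is at most 65535 + 65535 = 131070 (fits both signed and unsigned i32: nuw nsw), a sub can go negative but stays within signed range (nsw only), a mul is at most 65535 * 65535 = 4294836225, which fits unsigned but overflows signed i32 (nuw only), and a shl's shift amount is bounded by the original bit width, keeping 65535 << 15 below 2^31 (nuw nsw). A sketch of the expected rewrite for the plain add case:

; Before AMDGPUCodeGenPrepare:
define i16 @add_i16_sketch(i16 %a, i16 %b) {
  %r = add i16 %a, %b
  ret i16 %r
}
; Expected shape afterwards on VI:
;   %a32 = zext i16 %a to i32           ; at most 65535
;   %b32 = zext i16 %b to i32           ; at most 65535
;   %r32 = add nuw nsw i32 %a32, %b32   ; at most 131070, cannot wrap
;   %r = trunc i32 %r32 to i16
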
diff --git a/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll b/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
index dd16907b748c..0ba8836b20dc 100644
--- a/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
+++ b/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
@@ -13,9 +13,10 @@ define amdgpu_cs float @shader_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w
; GCN-LABEL: {{^}}kernel_cc:
; GCN: s_endpgm
-define float @kernel_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) {
+define amdgpu_kernel void @kernel_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) {
%vi = bitcast float %v to i32
%x = add i32 %vi, %w
%xf = bitcast i32 %x to float
- ret float %xf
+ store float %xf, float addrspace(1)* undef
+ ret void
}
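
amdgpu_kernel functions cannot return a value to a caller, so the rewritten test keeps the computation live by storing it instead. A minimal sketch of the same idiom (hypothetical name):

define amdgpu_kernel void @kernel_result_sketch(i32 %x) {
  store volatile i32 %x, i32 addrspace(1)* undef   ; result escapes through memory
  ret void
}
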
diff --git a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index 1511e1343808..1f4b1eaa209a 100644
--- a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -1,9 +1,9 @@
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=HSAOPT -check-prefix=OPT %s
; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=NOHSAOPT -check-prefix=OPT %s
@@ -27,8 +27,6 @@
; HSA-PROMOTE: workgroup_group_segment_byte_size = 5120
; HSA-PROMOTE: .end_amd_kernel_code_t
-; FIXME: These should be merged
-; HSA-PROMOTE: s_load_dword s{{[0-9]+}}, s[4:5], 0x1
; HSA-PROMOTE: s_load_dword s{{[0-9]+}}, s[4:5], 0x2
; SI-PROMOTE: ds_write_b32
@@ -58,9 +56,9 @@
; HSAOPT: [[LDZU:%[0-9]+]] = load i32, i32 addrspace(2)* [[GEP1]], align 4, !range !1, !invariant.load !0
; HSAOPT: [[EXTRACTY:%[0-9]+]] = lshr i32 [[LDXY]], 16
-; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range !1
-; HSAOPT: [[WORKITEM_ID_Y:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.y(), !range !1
-; HSAOPT: [[WORKITEM_ID_Z:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.z(), !range !1
+; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range !2
+; HSAOPT: [[WORKITEM_ID_Y:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.y(), !range !2
+; HSAOPT: [[WORKITEM_ID_Z:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.z(), !range !2
; HSAOPT: [[Y_SIZE_X_Z_SIZE:%[0-9]+]] = mul nuw nsw i32 [[EXTRACTY]], [[LDZU]]
; HSAOPT: [[YZ_X_XID:%[0-9]+]] = mul i32 [[Y_SIZE_X_Z_SIZE]], [[WORKITEM_ID_X]]
@@ -77,10 +75,10 @@
; NOHSAOPT: call i32 @llvm.r600.read.local.size.y(), !range !0
; NOHSAOPT: call i32 @llvm.r600.read.local.size.z(), !range !0
-; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !0
-; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !0
-; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !0
-define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !1
+; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !1
+; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !1
+define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
%stack = alloca [5 x i32], align 4
%0 = load i32, i32 addrspace(1)* %in, align 4
@@ -102,7 +100,7 @@ entry:
; OPT-LABEL: @high_alignment(
; OPT: getelementptr inbounds [256 x [8 x i32]], [256 x [8 x i32]] addrspace(3)* @high_alignment.stack, i32 0, i32 %{{[0-9]+}}
-define void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
%stack = alloca [8 x i32], align 16
%0 = load i32, i32 addrspace(1)* %in, align 4
@@ -127,7 +125,7 @@ entry:
; OPT: alloca [5 x i32]
; SI-NOT: ds_write
-define void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
%stack = alloca [5 x i32], align 4
%0 = load i32, i32 addrspace(1)* %in, align 4
@@ -162,7 +160,7 @@ entry:
; SI-NOT: v_movrel
%struct.point = type { i32, i32 }
-define void @multiple_structs(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 {
entry:
%a = alloca %struct.point
%b = alloca %struct.point
@@ -191,7 +189,7 @@ entry:
; R600-NOT: MOVA_INT
; SI-NOT: v_movrel
-define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
entry:
%prv_array_const = alloca [2 x i32]
%prv_array = alloca [2 x i32]
@@ -227,11 +225,15 @@ for.end:
; R600: MOVA_INT
-; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} ; encoding: [0x00,0x00,0x68,0xe0
-; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:2 ; encoding: [0x02,0x00,0x68,0xe0
+; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:6 ; encoding: [0x06,0x00,0x68,0xe0
+; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x68,0xe0
; Loaded value is 0 or 1, so sext will become zext, so we get buffer_load_ushort instead of buffer_load_sshort.
-; SI-PROMOTE: buffer_load_ushort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
-define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
+; SI-ALLOCA: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
+
+; SI-PROMOTE: s_load_dword [[IDX:s[0-9]+]]
+; SI-PROMOTE: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 16
+; SI-PROMOTE: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[SCALED_IDX]], 16
+define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
%0 = alloca [2 x i16]
%1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0
@@ -249,12 +251,12 @@ entry:
; R600: MOVA_INT
-; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} ; encoding:
-; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:1 ; encoding:
+; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding:
+; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding:
-; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} ; encoding: [0x00,0x00,0x60,0xe0
-; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:1 ; encoding: [0x01,0x00,0x60,0xe0
-define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
+; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x60,0xe0
+; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0
+define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
%0 = alloca [2 x i8]
%1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
@@ -277,7 +279,7 @@ entry:
;
; A total of 5 bytes should be allocated and used.
; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ;
-define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
entry:
%0 = alloca [3 x i8], align 1
%1 = alloca [2 x i8], align 1
@@ -301,7 +303,7 @@ entry:
ret void
}
-define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x [2 x i8]]
%gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
@@ -315,7 +317,7 @@ entry:
ret void
}
-define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x [2 x i32]]
%gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
@@ -328,7 +330,7 @@ entry:
ret void
}
-define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x [2 x i64]]
%gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
@@ -343,7 +345,7 @@ entry:
%struct.pair32 = type { i32, i32 }
-define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x [2 x %struct.pair32]]
%gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
@@ -356,7 +358,7 @@ entry:
ret void
}
-define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x %struct.pair32]
%gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
@@ -369,7 +371,7 @@ entry:
ret void
}
-define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
entry:
%tmp = alloca [2 x i32]
%tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
@@ -390,7 +392,7 @@ entry:
; SI-NOT: ds_write
; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 ;
-define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%alloca = alloca [16 x i32]
%tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
store i32 5, i32* %tmp0
@@ -406,7 +408,7 @@ define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; OPT-LABEL: @pointer_typed_alloca(
; OPT: getelementptr inbounds [256 x i32 addrspace(1)*], [256 x i32 addrspace(1)*] addrspace(3)* @pointer_typed_alloca.A.addr, i32 0, i32 %{{[0-9]+}}
; OPT: load i32 addrspace(1)*, i32 addrspace(1)* addrspace(3)* %{{[0-9]+}}, align 4
-define void @pointer_typed_alloca(i32 addrspace(1)* %A) {
+define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) {
entry:
%A.addr = alloca i32 addrspace(1)*, align 4
store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
@@ -458,7 +460,7 @@ entry:
; SI: buffer_load_dword
; SI: buffer_load_dword
-define void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) {
%alloca = alloca [2 x <16 x i32>]
%tmp0 = getelementptr [2 x <16 x i32>], [2 x <16 x i32>]* %alloca, i32 0, i32 %a
%tmp5 = load <16 x i32>, <16 x i32>* %tmp0
@@ -502,7 +504,7 @@ define void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) {
; SI: buffer_load_dword
; SI: buffer_load_dword
-define void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) {
%alloca = alloca [2 x <16 x float>]
%tmp0 = getelementptr [2 x <16 x float>], [2 x <16 x float>]* %alloca, i32 0, i32 %a
%tmp5 = load <16 x float>, <16 x float>* %tmp0
@@ -518,7 +520,7 @@ define void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) {
; SI: buffer_load_dword
; SI: buffer_load_dword
-define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
%alloca = alloca [16 x <2 x float>]
%tmp0 = getelementptr [16 x <2 x float>], [16 x <2 x float>]* %alloca, i32 0, i32 %a
%tmp5 = load <2 x float>, <2 x float>* %tmp0
@@ -529,7 +531,7 @@ define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
; OPT-LABEL: @direct_alloca_read_0xi32(
; OPT: store [0 x i32] undef, [0 x i32] addrspace(3)*
; OPT: load [0 x i32], [0 x i32] addrspace(3)*
-define void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) {
entry:
%tmp = alloca [0 x i32]
store [0 x i32] [], [0 x i32]* %tmp
@@ -541,7 +543,7 @@ entry:
; OPT-LABEL: @direct_alloca_read_1xi32(
; OPT: store [1 x i32] zeroinitializer, [1 x i32] addrspace(3)*
; OPT: load [1 x i32], [1 x i32] addrspace(3)*
-define void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) {
entry:
%tmp = alloca [1 x i32]
store [1 x i32] [i32 0], [1 x i32]* %tmp
@@ -553,6 +555,8 @@ entry:
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" }
; HSAOPT: !0 = !{}
-; HSAOPT: !1 = !{i32 0, i32 2048}
+; HSAOPT: !1 = !{i32 0, i32 257}
+; HSAOPT: !2 = !{i32 0, i32 256}
-; NOHSAOPT: !0 = !{i32 0, i32 2048}
+; NOHSAOPT: !0 = !{i32 0, i32 257}
+; NOHSAOPT: !1 = !{i32 0, i32 256}
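
The updated metadata encodes tighter bounds: !range is a half-open interval, so [0, 257) admits local sizes up to and including 256, while a workitem ID is strictly below the workgroup size, giving [0, 256). A sketch, assuming a 256-lane maximum workgroup size as the metadata implies:

declare i32 @llvm.r600.read.local.size.x()
declare i32 @llvm.amdgcn.workitem.id.x()

define i32 @range_sketch() {
  %size.x = call i32 @llvm.r600.read.local.size.x(), !range !0   ; [0, 257): size may equal 256
  %id.x = call i32 @llvm.amdgcn.workitem.id.x(), !range !1       ; [0, 256): id is below the size
  %sum = add i32 %size.x, %id.x
  ret i32 %sum
}
!0 = !{i32 0, i32 257}
!1 = !{i32 0, i32 256}
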
diff --git a/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll b/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
index e515ca00d184..187320805c11 100644
--- a/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
+++ b/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
@@ -12,7 +12,7 @@
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV {{\*? *}}[[VAL]], KC0[0].X
-define void @ngroups_x (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @ngroups_x (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.ngroups.x() #0
store i32 %0, i32 addrspace(1)* %out
@@ -27,7 +27,7 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV {{\*? *}}[[VAL]], KC0[0].Y
-define void @ngroups_y (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @ngroups_y (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.ngroups.y() #0
store i32 %0, i32 addrspace(1)* %out
@@ -42,7 +42,7 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV {{\*? *}}[[VAL]], KC0[0].Z
-define void @ngroups_z (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @ngroups_z (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.ngroups.z() #0
store i32 %0, i32 addrspace(1)* %out
@@ -57,7 +57,7 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV {{\*? *}}[[VAL]], KC0[0].W
-define void @global_size_x (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @global_size_x (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.global.size.x() #0
store i32 %0, i32 addrspace(1)* %out
@@ -72,7 +72,7 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV {{\*? *}}[[VAL]], KC0[1].X
-define void @global_size_y (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @global_size_y (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.global.size.y() #0
store i32 %0, i32 addrspace(1)* %out
@@ -87,7 +87,7 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV {{\*? *}}[[VAL]], KC0[1].Y
-define void @global_size_z (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @global_size_z (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.global.size.z() #0
store i32 %0, i32 addrspace(1)* %out
@@ -102,7 +102,7 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV {{\*? *}}[[VAL]], KC0[1].Z
-define void @local_size_x (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_x (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.local.size.x() #0
store i32 %0, i32 addrspace(1)* %out
@@ -117,7 +117,7 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV {{\*? *}}[[VAL]], KC0[1].W
-define void @local_size_y (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_y (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.local.size.y() #0
store i32 %0, i32 addrspace(1)* %out
@@ -132,7 +132,7 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV {{\*? *}}[[VAL]], KC0[2].X
-define void @local_size_z (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_z (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.local.size.z() #0
store i32 %0, i32 addrspace(1)* %out
@@ -153,7 +153,7 @@ entry:
; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @tgid_x_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tgid_x_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tgid.x() #0
store i32 %0, i32 addrspace(1)* %out
@@ -165,7 +165,7 @@ entry:
; GCN-NOHSA: buffer_store_dword [[VVAL]]
; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
-define void @tgid_y_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tgid_y_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tgid.y() #0
store i32 %0, i32 addrspace(1)* %out
@@ -181,7 +181,7 @@ entry:
; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @tgid_z_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tgid_z_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tgid.z() #0
store i32 %0, i32 addrspace(1)* %out
@@ -194,7 +194,7 @@ entry:
; FUNC-LABEL: {{^}}tidig_x_legacy:
; GCN-NOHSA: buffer_store_dword v0
-define void @tidig_x_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tidig_x_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.x() #0
store i32 %0, i32 addrspace(1)* %out
@@ -208,7 +208,7 @@ entry:
; FUNC-LABEL: {{^}}tidig_y_legacy:
; GCN-NOHSA: buffer_store_dword v1
-define void @tidig_y_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tidig_y_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.y() #0
store i32 %0, i32 addrspace(1)* %out
@@ -221,7 +221,7 @@ entry:
; FUNC-LABEL: {{^}}tidig_z_legacy:
; GCN-NOHSA: buffer_store_dword v2
-define void @tidig_z_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tidig_z_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.z() #0
store i32 %0, i32 addrspace(1)* %out
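
For reference, the EG checks in this file imply the following r600 implicit-parameter layout; this is reconstructed from the CHECK lines above, not an authoritative table:

;   KC0[0] = { ngroups.x, ngroups.y, ngroups.z, global.size.x }
;   KC0[1] = { global.size.y, global.size.z, local.size.x, local.size.y }
;   KC0[2].X = local.size.z
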
diff --git a/test/CodeGen/AMDGPU/and-gcn.ll b/test/CodeGen/AMDGPU/and-gcn.ll
index dde5f8c21769..2aec03aff8a3 100644
--- a/test/CodeGen/AMDGPU/and-gcn.ll
+++ b/test/CodeGen/AMDGPU/and-gcn.ll
@@ -4,7 +4,7 @@
; FUNC-LABEL: {{^}}v_and_i64_br:
; SI: v_and_b32
; SI: v_and_b32
-define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
+define amdgpu_kernel void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
entry:
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%tmp0 = icmp eq i32 %tid, 0
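
The mbcnt idiom above computes a lane index: llvm.amdgcn.mbcnt.lo(-1, 0) counts, among the low 32 lanes, those below the current lane, so lane 0 sees 0 and the icmp selects it. For a full 0..63 lane id it is paired with mbcnt.hi, as several tests below do. A minimal sketch of that pairing:

; Sketch: full lane id within a 64-lane wave via the usual mbcnt pairing.
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) readnone
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) readnone

define amdgpu_kernel void @lane_id_sketch(i32 addrspace(1)* %out) {
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %id = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  store i32 %id, i32 addrspace(1)* %out
  ret void
}
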
diff --git a/test/CodeGen/AMDGPU/and.ll b/test/CodeGen/AMDGPU/and.ll
index 5d9dcf64debf..c356f8b87cfc 100644
--- a/test/CodeGen/AMDGPU/and.ll
+++ b/test/CodeGen/AMDGPU/and.ll
@@ -11,7 +11,7 @@ declare i32 @llvm.r600.read.tidig.x() #0
; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
%a = load <2 x i32>, <2 x i32> addrspace(1) * %in
%b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
@@ -31,7 +31,7 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32>, <4 x i32> addrspace(1) * %in
%b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
@@ -42,7 +42,7 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
; FUNC-LABEL: {{^}}s_and_i32:
; SI: s_and_b32
-define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
%and = and i32 %a, %b
store i32 %and, i32 addrspace(1)* %out, align 4
ret void
@@ -50,7 +50,7 @@ define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
; FUNC-LABEL: {{^}}s_and_constant_i32:
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687
-define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
%and = and i32 %a, 1234567
store i32 %and, i32 addrspace(1)* %out, align 4
ret void
@@ -66,7 +66,7 @@ define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]]
; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
; SI: buffer_store_dword [[VK]]
-define void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) {
%and = and i32 %a, 1234567
; Just to stop future replacement of copy to vgpr + store with VALU op.
@@ -83,7 +83,7 @@ define void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32
; SI: s_add_i32
; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, [[K]]
; SI: buffer_store_dword [[VK]]
-define void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) {
%and = and i32 %a, 1234567
%foo = add i32 %and, 1234567
%bar = add i32 %foo, %b
@@ -93,7 +93,7 @@ define void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32
; FUNC-LABEL: {{^}}v_and_i32_vgpr_vgpr:
; SI: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) {
+define amdgpu_kernel void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) {
%tid = call i32 @llvm.r600.read.tidig.x() #0
%gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -109,7 +109,7 @@ define void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
; SI-DAG: s_load_dword [[SA:s[0-9]+]]
; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]]
; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]]
-define void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1)* %bptr) {
+define amdgpu_kernel void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1)* %bptr) {
%tid = call i32 @llvm.r600.read.tidig.x() #0
%gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -123,7 +123,7 @@ define void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1
; SI-DAG: s_load_dword [[SA:s[0-9]+]]
; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]]
; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]]
-define void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 %b) {
+define amdgpu_kernel void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 %b) {
%tid = call i32 @llvm.r600.read.tidig.x() #0
%gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -135,7 +135,7 @@ define void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
; FUNC-LABEL: {{^}}v_and_constant_i32
; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}}
-define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
%a = load i32, i32 addrspace(1)* %aptr, align 4
%and = and i32 %a, 1234567
store i32 %and, i32 addrspace(1)* %out, align 4
@@ -144,7 +144,7 @@ define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr)
; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32
; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}}
-define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
%a = load i32, i32 addrspace(1)* %aptr, align 4
%and = and i32 %a, 64
store i32 %and, i32 addrspace(1)* %out, align 4
@@ -153,7 +153,7 @@ define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %
; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32
; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}}
-define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
%a = load i32, i32 addrspace(1)* %aptr, align 4
%and = and i32 %a, -16
store i32 %and, i32 addrspace(1)* %out, align 4
@@ -162,7 +162,7 @@ define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1
; FUNC-LABEL: {{^}}s_and_i64
; SI: s_and_b64
-define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
%and = and i64 %a, %b
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -171,7 +171,7 @@ define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
; FIXME: Should use SGPRs
; FUNC-LABEL: {{^}}s_and_i1:
; SI: v_and_b32
-define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
+define amdgpu_kernel void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
%and = and i1 %a, %b
store i1 %and, i1 addrspace(1)* %out
ret void
@@ -181,7 +181,7 @@ define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000{{$}}
; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80{{$}}
; SI: buffer_store_dwordx2
-define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
%and = and i64 %a, 549756338176
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -191,7 +191,7 @@ define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
; XSI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x80000{{$}}
; XSI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0x80{{$}}
; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}
-define void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
%and0 = and i64 %a, 549756338176
%and1 = and i64 %b, 549756338176
store volatile i64 %and0, i64 addrspace(1)* %out
@@ -205,7 +205,7 @@ define void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}}
; SI-NOT: and
; SI: buffer_store_dwordx2
-define void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) {
%and = and i64 %a, 1234567
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -223,7 +223,7 @@ define void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) {
; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
; SI-NOT: and
; SI: buffer_store_dwordx2
-define void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
+define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
%shl.a = shl i64 %a, 1
%shl.b = shl i64 %b, 1
%and0 = and i64 %shl.a, 62
@@ -238,7 +238,7 @@ define void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64
; FUNC-LABEL: {{^}}v_and_i64:
; SI: v_and_b32
; SI: v_and_b32
-define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
+define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%b = load i64, i64 addrspace(1)* %bptr, align 8
%and = and i64 %a, %b
@@ -250,7 +250,7 @@ define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addr
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0xab19b207, {{v[0-9]+}}
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}}
; SI: buffer_store_dwordx2
-define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%and = and i64 %a, 1231231234567
store i64 %and, i64 addrspace(1)* %out, align 8
@@ -268,7 +268,7 @@ define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr)
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI1]]
; SI: buffer_store_dwordx2
; SI: buffer_store_dwordx2
-define void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load volatile i64, i64 addrspace(1)* %aptr
%b = load volatile i64, i64 addrspace(1)* %aptr
%and0 = and i64 %a, 1231231234567
@@ -288,7 +288,7 @@ define void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(
; SI-NOT: and
; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]]
; SI: buffer_store_dwordx2 v{{\[}}[[RESLO1]]
-define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load volatile i64, i64 addrspace(1)* %aptr
%b = load volatile i64, i64 addrspace(1)* %aptr
%and0 = and i64 %a, 63
@@ -304,7 +304,7 @@ define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspac
; SI: v_and_b32_e32 {{v[0-9]+}}, 0x12d687, [[VAL]]
; SI-NOT: and
; SI: buffer_store_dwordx2
-define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%and = and i64 %a, 1234567
store i64 %and, i64 addrspace(1)* %out, align 8
@@ -317,7 +317,7 @@ define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)*
; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}}
; SI-NOT: and
; SI: buffer_store_dwordx2
-define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%and = and i64 %a, 64
store i64 %and, i64 addrspace(1)* %out, align 8
@@ -331,7 +331,7 @@ define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %apt
; SI: v_and_b32_e32 v[[VAL_LO]], -8, v[[VAL_LO]]
; SI-NOT: and
; SI: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
-define void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%and = and i64 %a, -8
store i64 %and, i64 addrspace(1)* %out, align 8
@@ -344,7 +344,7 @@ define void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 64
; SI-NOT: and
; SI: buffer_store_dword
-define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 64
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -358,7 +358,7 @@ define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
; SI-NOT: and
; SI: s_add_u32
; SI-NEXT: s_addc_u32
-define void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) {
%shl = shl i64 %a, 1
%and = and i64 %shl, 64
%add = add i64 %and, %b
@@ -372,7 +372,7 @@ define void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrsp
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
; SI-NOT: and
; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 1
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -387,7 +387,7 @@ define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3ff00000
; SI-NOT: and
; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 4607182418800017408
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -402,7 +402,7 @@ define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbff00000
; SI-NOT: and
; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 13830554455654793216
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -417,7 +417,7 @@ define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3fe00000
; SI-NOT: and
; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 4602678819172646912
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -432,7 +432,7 @@ define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbfe00000
; SI-NOT: and
; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 13826050856027422720
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -445,7 +445,7 @@ define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 2.0
; SI-NOT: and
; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 4611686018427387904
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -458,7 +458,7 @@ define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, -2.0
; SI-NOT: and
; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 13835058055282163712
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -473,7 +473,7 @@ define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x40100000
; SI-NOT: and
; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 4616189618054758400
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -488,7 +488,7 @@ define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xc0100000
; SI-NOT: and
; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 13839561654909534208
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -505,7 +505,7 @@ define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(
; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
; SI-NOT: and
; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 1082130432
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -518,7 +518,7 @@ define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(
; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
; SI-NOT: and
; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, -1065353216
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -531,7 +531,7 @@ define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrsp
; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
; SI-NOT: and
; SI: buffer_store_dwordx2
-define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 4647714815446351872
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -544,7 +544,7 @@ define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrs
; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
; SI-NOT: and
; SI: buffer_store_dwordx2
-define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 13871086852301127680
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
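
One bit of arithmetic makes the inline-immediate checks above easier to audit: 4611686018427387904 is 0x4000000000000000, whose low dword is 0 and whose high dword 0x40000000 is the IEEE-754 single-precision pattern for 2.0, an SI inline immediate. That is why s_and_inline_imm_2.0_i64 expects exactly one s_and_b32 with the literal 2.0 and no second AND. The same shape as a standalone sketch:

; Sketch: mask = 0x4000000000000000
;   lo32(mask) = 0x00000000 -> low result dword is constant 0, no AND needed
;   hi32(mask) = 0x40000000 -> f32 bit pattern of 2.0, an inline immediate,
;                              so one s_and_b32 covers the high half
define amdgpu_kernel void @and_inline_imm_sketch(i64 addrspace(1)* %out, i64 %a) {
  %and = and i64 %a, 4611686018427387904
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}
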
diff --git a/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index 084a6933da26..e2620ce353c6 100644
--- a/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -11,22 +11,22 @@ declare i32 @llvm.amdgcn.workitem.id.z() #0
declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
-; HSA: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.amdgcn.workgroup.id.x()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
-; HSA: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
-define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.amdgcn.workgroup.id.y()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
-; HSA: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
-define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.amdgcn.workgroup.id.y()
store volatile i32 %val0, i32 addrspace(1)* %ptr
%val1 = call i32 @llvm.amdgcn.workgroup.id.y()
@@ -34,8 +34,8 @@ define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
ret void
}
-; HSA: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
-define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.amdgcn.workgroup.id.x()
%val1 = call i32 @llvm.amdgcn.workgroup.id.y()
store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -43,15 +43,15 @@ define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
ret void
}
-; HSA: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
-define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
+define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.amdgcn.workgroup.id.z()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
-; HSA: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
-define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
+define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.amdgcn.workgroup.id.x()
%val1 = call i32 @llvm.amdgcn.workgroup.id.z()
store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -59,8 +59,8 @@ define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
ret void
}
-; HSA: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
-define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
+define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.amdgcn.workgroup.id.y()
%val1 = call i32 @llvm.amdgcn.workgroup.id.z()
store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -68,8 +68,8 @@ define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
ret void
}
-; HSA: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
-define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
+define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.amdgcn.workgroup.id.x()
%val1 = call i32 @llvm.amdgcn.workgroup.id.y()
%val2 = call i32 @llvm.amdgcn.workgroup.id.z()
@@ -79,29 +79,29 @@ define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
ret void
}
-; HSA: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.amdgcn.workitem.id.x()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
-; HSA: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
-define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
+define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.amdgcn.workitem.id.y()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
-; HSA: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
-define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
+define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.amdgcn.workitem.id.z()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
-; HSA: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.amdgcn.workitem.id.x()
%val1 = call i32 @llvm.amdgcn.workgroup.id.x()
store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -109,8 +109,8 @@ define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
ret void
}
-; HSA: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
-define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
+define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.amdgcn.workitem.id.y()
%val1 = call i32 @llvm.amdgcn.workgroup.id.y()
store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -118,8 +118,8 @@ define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
ret void
}
-; HSA: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
-define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
+define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.amdgcn.workitem.id.x()
%val1 = call i32 @llvm.amdgcn.workitem.id.y()
%val2 = call i32 @llvm.amdgcn.workitem.id.z()
@@ -129,8 +129,8 @@ define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
ret void
}
-; HSA: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
-define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
+define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.amdgcn.workitem.id.x()
%val1 = call i32 @llvm.amdgcn.workitem.id.y()
%val2 = call i32 @llvm.amdgcn.workitem.id.z()
@@ -146,8 +146,8 @@ define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
ret void
}
-; HSA: define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 {
-define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 {
+define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
%dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
%bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
%val = load i32, i32 addrspace(2)* %bc
@@ -155,8 +155,8 @@ define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
ret void
}
-; HSA: define void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 {
-define void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 {
+define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
%dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
%bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
%val = load i32, i32 addrspace(2)* %bc
@@ -164,58 +164,58 @@ define void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
ret void
}
-; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 {
-define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 {
+define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
%stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
store volatile i32 0, i32 addrspace(4)* %stof
ret void
}
-; HSA: define void @use_private_to_flat_addrspacecast(i32* %ptr) #11 {
-define void @use_private_to_flat_addrspacecast(i32* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #11 {
+define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #1 {
%stof = addrspacecast i32* %ptr to i32 addrspace(4)*
store volatile i32 0, i32 addrspace(4)* %stof
ret void
}
-; HSA: define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
store volatile i32 0, i32 addrspace(3)* %ftos
ret void
}
-; HSA: define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
store volatile i32 0, i32* %ftos
ret void
}
; No-op addrspacecast should not use queue ptr
-; HSA: define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
-define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
%stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
store volatile i32 0, i32 addrspace(4)* %stof
ret void
}
-; HSA: define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
-define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
+define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
%stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
%ld = load volatile i32, i32 addrspace(4)* %stof
ret void
}
-; HSA: define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
store volatile i32 0, i32 addrspace(1)* %ftos
ret void
}
-; HSA: define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
%ld = load volatile i32, i32 addrspace(2)* %ftos
ret void
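
The #1..#11 attribute groups these HSA checks name are declared outside the hunk. For orientation only, a hedged sketch of what a few of them plausibly contain, using the feature-attribute strings the AMDGPU annotation pass used around this time; treat the exact names as assumptions, not as quotes from the test:

; Hedged sketch of some referenced attribute groups (names assumed):
attributes #1 = { nounwind }                       ; x-only ids add no feature
attributes #2 = { nounwind "amdgpu-work-group-id-y" }
attributes #3 = { nounwind "amdgpu-work-group-id-z" }
attributes #10 = { nounwind "amdgpu-dispatch-ptr" }
attributes #11 = { nounwind "amdgpu-queue-ptr" }   ; also inferred for group/private to flat casts
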
diff --git a/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/test/CodeGen/AMDGPU/annotate-kernel-features.ll
index a4e7bb67d507..09750da4cb8c 100644
--- a/test/CodeGen/AMDGPU/annotate-kernel-features.ll
+++ b/test/CodeGen/AMDGPU/annotate-kernel-features.ll
@@ -12,22 +12,22 @@ declare i32 @llvm.r600.read.local.size.x() #0
declare i32 @llvm.r600.read.local.size.y() #0
declare i32 @llvm.r600.read.local.size.z() #0
-; ALL: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.r600.read.tgid.x()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
-; ALL: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
-define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.r600.read.tgid.y()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
-; ALL: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
-define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.r600.read.tgid.y()
store volatile i32 %val0, i32 addrspace(1)* %ptr
%val1 = call i32 @llvm.r600.read.tgid.y()
@@ -35,8 +35,8 @@ define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
ret void
}
-; ALL: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
-define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.r600.read.tgid.x()
%val1 = call i32 @llvm.r600.read.tgid.y()
store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -44,15 +44,15 @@ define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
ret void
}
-; ALL: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
-define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
+define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.r600.read.tgid.z()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
-; ALL: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
-define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
+define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.r600.read.tgid.x()
%val1 = call i32 @llvm.r600.read.tgid.z()
store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -60,8 +60,8 @@ define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
ret void
}
-; ALL: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
-define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
+define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.r600.read.tgid.y()
%val1 = call i32 @llvm.r600.read.tgid.z()
store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -69,8 +69,8 @@ define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
ret void
}
-; ALL: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
-define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
+define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.r600.read.tgid.x()
%val1 = call i32 @llvm.r600.read.tgid.y()
%val2 = call i32 @llvm.r600.read.tgid.z()
@@ -80,29 +80,29 @@ define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
ret void
}
-; ALL: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.r600.read.tidig.x()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
-; ALL: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
-define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
+define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.r600.read.tidig.y()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
-; ALL: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
-define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
+define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.r600.read.tidig.z()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
-; ALL: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.r600.read.tidig.x()
%val1 = call i32 @llvm.r600.read.tgid.x()
store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -110,8 +110,8 @@ define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
ret void
}
-; ALL: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
-define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
+define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.r600.read.tidig.y()
%val1 = call i32 @llvm.r600.read.tgid.y()
store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -119,8 +119,8 @@ define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
ret void
}
-; ALL: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
-define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
+define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.r600.read.tidig.x()
%val1 = call i32 @llvm.r600.read.tidig.y()
%val2 = call i32 @llvm.r600.read.tidig.z()
@@ -130,8 +130,8 @@ define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
ret void
}
-; ALL: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
-define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
+define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.r600.read.tidig.x()
%val1 = call i32 @llvm.r600.read.tidig.y()
%val2 = call i32 @llvm.r600.read.tidig.z()
@@ -147,25 +147,25 @@ define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
ret void
}
-; HSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #10 {
-; NOHSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #10 {
+; NOHSA: define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.r600.read.local.size.x()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
-; HSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #10 {
-; NOHSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
-define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #10 {
+; NOHSA: define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.r600.read.local.size.y()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
-; HSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #10 {
-; NOHSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
-define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #10 {
+; NOHSA: define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.r600.read.local.size.z()
store i32 %val, i32 addrspace(1)* %ptr
ret void
diff --git a/test/CodeGen/AMDGPU/anonymous-gv.ll b/test/CodeGen/AMDGPU/anonymous-gv.ll
index f37b0f3382f4..04fbe2ae1f94 100644
--- a/test/CodeGen/AMDGPU/anonymous-gv.ll
+++ b/test/CodeGen/AMDGPU/anonymous-gv.ll
@@ -6,13 +6,13 @@
; CHECK-LABEL: {{^}}test:
; CHECK: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, __unnamed_1
; CHECK: s_endpgm
-define void @test() {
+define amdgpu_kernel void @test() {
store i32 1, i32 addrspace(1)* @0
ret void
}
; CHECK-LABEL: {{^}}__unnamed_2:
; CHECK: s_endpgm
-define void @1() {
+define amdgpu_kernel void @1() {
ret void
}
diff --git a/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll b/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
new file mode 100644
index 000000000000..c61c23222bc7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
@@ -0,0 +1,58 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}any_extend_vector_inreg_v16i8_to_v4i32:
+; GCN: {{buffer|flat}}_load_dwordx4
+; GCN-DAG: {{buffer|flat}}_load_dwordx4
+; GCN-DAG: {{buffer|flat}}_load_dword
+
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+define amdgpu_kernel void @any_extend_vector_inreg_v16i8_to_v4i32(<8 x i8> addrspace(1)* nocapture readonly %arg, <16 x i8> addrspace(1)* %arg1) local_unnamed_addr #0 {
+bb:
+ %tmp = bitcast <8 x i8> addrspace(1)* %arg to <16 x i8> addrspace(1)*
+ %tmp2 = load <16 x i8>, <16 x i8> addrspace(1)* %tmp, align 16
+ %tmp3 = extractelement <16 x i8> %tmp2, i64 4
+ %tmp6 = extractelement <16 x i8> %tmp2, i64 11
+ %tmp10 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %arg, i64 2
+ %tmp11 = bitcast <8 x i8> addrspace(1)* %tmp10 to <16 x i8> addrspace(1)*
+ %tmp12 = load <16 x i8>, <16 x i8> addrspace(1)* %tmp11, align 16
+ %tmp13 = extractelement <16 x i8> %tmp12, i64 7
+ %tmp17 = extractelement <16 x i8> %tmp12, i64 12
+ %tmp21 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %arg, i64 4
+ %tmp22 = bitcast <8 x i8> addrspace(1)* %tmp21 to <16 x i8> addrspace(1)*
+ %tmp23 = load <16 x i8>, <16 x i8> addrspace(1)* %tmp22, align 16
+ %tmp24 = extractelement <16 x i8> %tmp23, i64 3
+ %tmp1 = insertelement <16 x i8> undef, i8 %tmp3, i32 2
+ %tmp4 = insertelement <16 x i8> %tmp1, i8 0, i32 3
+ %tmp5 = insertelement <16 x i8> %tmp4, i8 0, i32 4
+ %tmp7 = insertelement <16 x i8> %tmp5, i8 %tmp6, i32 5
+ %tmp8 = insertelement <16 x i8> %tmp7, i8 0, i32 6
+ %tmp9 = insertelement <16 x i8> %tmp8, i8 %tmp13, i32 7
+ %tmp14 = insertelement <16 x i8> %tmp9, i8 0, i32 8
+ %tmp15 = insertelement <16 x i8> %tmp14, i8 %tmp17, i32 9
+ %tmp16 = insertelement <16 x i8> %tmp15, i8 0, i32 10
+ %tmp18 = insertelement <16 x i8> %tmp16, i8 0, i32 11
+ %tmp19 = insertelement <16 x i8> %tmp18, i8 %tmp24, i32 12
+ store <16 x i8> %tmp19, <16 x i8> addrspace(1)* %arg1, align 1
+ ret void
+}
+
+attributes #0 = { nounwind }
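
All sixteen _store_byte checks follow from the final store in this new test: storing a <16 x i8> with align 1 under-aligns a 128-bit value, so it is legalized into sixteen single-byte stores. For contrast, a naturally aligned variant (a sketch, not part of the test) would be expected to select to a single 128-bit store:

; Sketch: with align 16 the same store should become one dwordx4 store
; rather than sixteen byte stores.
define amdgpu_kernel void @aligned_v16i8_store_sketch(<16 x i8> %v, <16 x i8> addrspace(1)* %p) {
  store <16 x i8> %v, <16 x i8> addrspace(1)* %p, align 16
  ret void
}
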
diff --git a/test/CodeGen/AMDGPU/anyext.ll b/test/CodeGen/AMDGPU/anyext.ll
index 87b4c86427c8..3f220c408412 100644
--- a/test/CodeGen/AMDGPU/anyext.ll
+++ b/test/CodeGen/AMDGPU/anyext.ll
@@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
; GCN-LABEL: {{^}}anyext_i1_i32:
; GCN: v_cndmask_b32_e64
-define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) {
entry:
%tmp = icmp eq i32 %cond, 0
%tmp1 = zext i1 %tmp to i8
@@ -22,7 +22,7 @@ entry:
; VI: v_xor_b32_e32 [[XOR:v[0-9]+]], -1, [[ADD]]
; VI: v_and_b32_e32 [[AND:v[0-9]+]], 1, [[XOR]]
; VI: buffer_store_dword [[AND]]
-define void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 addrspace(1)* %b) {
+define amdgpu_kernel void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 addrspace(1)* %b) {
entry:
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
index f190bd0cb01e..daa3442097cf 100644
--- a/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -12,11 +12,7 @@ declare void @llvm.amdgcn.s.barrier() #2
; SI-LABEL: {{^}}test_private_array_ptr_calc:
-; FIXME: We end up with zero argument for ADD, because
-; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index
-; with the appropriate offset. We should fold this into the store.
-
-; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 0, v{{[0-9]+}}
+; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 16, v{{[0-9]+}}
; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:64
; SI-ALLOCA: s_barrier
; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:64
@@ -28,7 +24,7 @@ declare void @llvm.amdgcn.s.barrier() #2
; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 64
; SI-PROMOTE: ds_write_b32 [[PTRREG]]
-define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 {
+define amdgpu_kernel void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 {
%alloca = alloca [16 x i32], align 16
%mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
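
The rewritten SI-ALLOCA check records a behavior change alongside the calling-convention update: the v_add_i32 that forms the private array's address now carries a real offset (16 in the new check) where it previously carried the dead 0 the deleted FIXME complained about. The path is exercised by a dynamically indexed private array, since a runtime index keeps the pointer add alive; a sketch with illustrative names:

; Sketch: a per-thread private array indexed by a runtime value, so the
; frame-index add survives into the final ISA.
define amdgpu_kernel void @private_array_sketch(i32 addrspace(1)* %out, i32 %idx) {
  %alloca = alloca [16 x i32], align 16
  %p = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %idx
  store i32 1, i32* %p
  %v = load i32, i32* %p
  store i32 %v, i32 addrspace(1)* %out
  ret void
}
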
diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
index b914edf2928e..ddeffc10a089 100644
--- a/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
+++ b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
@@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_hi_i32
; SI: s_endpgm
-define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
%mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
%a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0
diff --git a/test/CodeGen/AMDGPU/ashr.v2i16.ll b/test/CodeGen/AMDGPU/ashr.v2i16.ll
new file mode 100644
index 000000000000..96a5e3b23758
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ashr.v2i16.ll
@@ -0,0 +1,161 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
+
+; GCN-LABEL: {{^}}s_ashr_v2i16:
+; GFX9: s_load_dword [[LHS:s[0-9]+]]
+; GFX9: s_load_dword [[RHS:s[0-9]+]]
+; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
+; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
+
+; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
+; CI: v_ashrrev_i32_e32
+; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
+; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_or_b32_e32
+define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
+ %result = ashr <2 x i16> %lhs, %rhs
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_ashr_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
+; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+
+; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_ashrrev_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}}
+; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], [[RHS]]
+; CI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, [[LHS]]
+; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1
+ %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
+ %result = ashr <2 x i16> %a, %b
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}ashr_v_s_v2i16:
+; GFX9: s_load_dword [[RHS:s[0-9]+]]
+; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+define amdgpu_kernel void @ashr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %result = ashr <2 x i16> %vgpr, %sgpr
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}ashr_s_v_v2i16:
+; GFX9: s_load_dword [[LHS:s[0-9]+]]
+; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
+; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+define amdgpu_kernel void @ashr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %result = ashr <2 x i16> %sgpr, %vgpr
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}ashr_imm_v_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
+; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], -4
+define amdgpu_kernel void @ashr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %result = ashr <2 x i16> <i16 -4, i16 -4>, %vgpr
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}ashr_v_imm_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], 8, [[LHS]]
+define amdgpu_kernel void @ashr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %result = ashr <2 x i16> %vgpr, <i16 8, i16 8>
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_ashr_v4i16:
+; GCN: {{buffer|flat}}_load_dwordx2
+; GCN: {{buffer|flat}}_load_dwordx2
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_ashrrev_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_ashrrev_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; GCN: {{buffer|flat}}_store_dwordx2
+define amdgpu_kernel void @v_ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
+ %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1
+ %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+ %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
+ %result = ashr <4 x i16> %a, %b
+ store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}ashr_v_imm_v4i16:
+; GCN: {{buffer|flat}}_load_dwordx2
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GCN: {{buffer|flat}}_store_dwordx2
+define amdgpu_kernel void @ashr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+ %result = ashr <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
+ store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
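
To re-check one prefix of this new test by hand, expand the corresponding RUN line, substituting the file for %s; for the GFX9 checks, for example:

llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs \
    < test/CodeGen/AMDGPU/ashr.v2i16.ll \
  | FileCheck -check-prefix=GCN -check-prefix=GFX9 test/CodeGen/AMDGPU/ashr.v2i16.ll
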
diff --git a/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
index 25eae0b41ae4..4f9526ddab55 100644
--- a/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
+++ b/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
@@ -12,7 +12,7 @@
; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
%result = extractvalue { i32, i1 } %pair, 0
@@ -33,7 +33,7 @@ define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrs
; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32
; GCN: buffer_store_dwordx2 [[RESULT]],
; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
%result = extractvalue { i64, i1 } %pair, 0
@@ -45,7 +45,7 @@ define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrs
; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
%sub = sub i32 %a, %b
%add = add i32 %sub, 4
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
@@ -65,7 +65,7 @@ define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i3
; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
%result = extractvalue { i32, i1 } %pair, 0
@@ -84,7 +84,7 @@ define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %sw
; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
; GCN: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
%result = extractvalue { i64, i1 } %pair, 0
diff --git a/test/CodeGen/AMDGPU/atomic_load_add.ll b/test/CodeGen/AMDGPU/atomic_load_add.ll
index 4b014e09b630..e0fe6641fa11 100644
--- a/test/CodeGen/AMDGPU/atomic_load_add.ll
+++ b/test/CodeGen/AMDGPU/atomic_load_add.ll
@@ -5,7 +5,7 @@
; FUNC-LABEL: {{^}}atomic_add_local:
; R600: LDS_ADD *
; SI: ds_add_u32
-define void @atomic_add_local(i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_add_local(i32 addrspace(3)* %local) {
%unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
ret void
}
@@ -13,7 +13,7 @@ define void @atomic_add_local(i32 addrspace(3)* %local) {
; FUNC-LABEL: {{^}}atomic_add_local_const_offset:
; R600: LDS_ADD *
; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
%gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
%val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
ret void
@@ -22,7 +22,7 @@ define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
; FUNC-LABEL: {{^}}atomic_add_ret_local:
; R600: LDS_ADD_RET *
; SI: ds_add_rtn_u32
-define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
%val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
store i32 %val, i32 addrspace(1)* %out
ret void
@@ -31,7 +31,7 @@ define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %loc
; FUNC-LABEL: {{^}}atomic_add_ret_local_const_offset:
; R600: LDS_ADD_RET *
; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
-define void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
%gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
%val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
store i32 %val, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/atomic_load_sub.ll b/test/CodeGen/AMDGPU/atomic_load_sub.ll
index c6e5b1136d7c..a0275893919a 100644
--- a/test/CodeGen/AMDGPU/atomic_load_sub.ll
+++ b/test/CodeGen/AMDGPU/atomic_load_sub.ll
@@ -5,7 +5,7 @@
; FUNC-LABEL: {{^}}atomic_sub_local:
; R600: LDS_SUB *
; SI: ds_sub_u32
-define void @atomic_sub_local(i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_sub_local(i32 addrspace(3)* %local) {
%unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
ret void
}
@@ -13,7 +13,7 @@ define void @atomic_sub_local(i32 addrspace(3)* %local) {
; FUNC-LABEL: {{^}}atomic_sub_local_const_offset:
; R600: LDS_SUB *
; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
%gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
%val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
ret void
@@ -22,7 +22,7 @@ define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
; FUNC-LABEL: {{^}}atomic_sub_ret_local:
; R600: LDS_SUB_RET *
; SI: ds_sub_rtn_u32
-define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
%val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
store i32 %val, i32 addrspace(1)* %out
ret void
@@ -31,7 +31,7 @@ define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %loc
; FUNC-LABEL: {{^}}atomic_sub_ret_local_const_offset:
; R600: LDS_SUB_RET *
; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
-define void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
%gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
%val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
store i32 %val, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
index cab377feacb2..63a6f6a8d32c 100644
--- a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -5,7 +5,7 @@
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
-define void @min_64_max_64() #0 {
+define amdgpu_kernel void @min_64_max_64() #0 {
entry:
ret void
}
@@ -16,7 +16,7 @@ attributes #0 = {"amdgpu-flat-work-group-size"="64,64"}
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
-define void @min_64_max_128() #1 {
+define amdgpu_kernel void @min_64_max_128() #1 {
entry:
ret void
}
@@ -27,7 +27,7 @@ attributes #1 = {"amdgpu-flat-work-group-size"="64,128"}
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
-define void @min_128_max_128() #2 {
+define amdgpu_kernel void @min_128_max_128() #2 {
entry:
ret void
}
@@ -39,7 +39,7 @@ attributes #2 = {"amdgpu-flat-work-group-size"="128,128"}
; CHECK: NumSGPRsForWavesPerEU: 13
; CHECK: NumVGPRsForWavesPerEU: 32
@var = addrspace(1) global float 0.0
-define void @min_1024_max_2048() #3 {
+define amdgpu_kernel void @min_1024_max_2048() #3 {
%val0 = load volatile float, float addrspace(1)* @var
%val1 = load volatile float, float addrspace(1)* @var
%val2 = load volatile float, float addrspace(1)* @var
diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
index e4f6e72e6977..ac2f7b4a4a4b 100644
--- a/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
+++ b/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
@@ -4,25 +4,22 @@
; If spilling to smem, additional registers are used for the resource
; descriptor.
-; ALL-LABEL: {{^}}max_12_sgprs:
+; ALL-LABEL: {{^}}max_9_sgprs:
-; FIXME: Should be able to skip this copying of the private segment
-; buffer because all the SGPR spills are to VGPRs.
-
-; ALL: s_mov_b64 s[10:11], s[2:3]
-; ALL: s_mov_b64 s[8:9], s[0:1]
; ALL: SGPRBlocks: 1
-; ALL: NumSGPRsForWavesPerEU: 14
-define void @max_12_sgprs(i32 addrspace(1)* %out1,
+; ALL: NumSGPRsForWavesPerEU: 9
+define amdgpu_kernel void @max_9_sgprs(i32 addrspace(1)* %out1,
i32 addrspace(1)* %out2,
i32 addrspace(1)* %out3,
i32 addrspace(1)* %out4,
- i32 %one, i32 %two, i32 %three, i32 %four) #0 {
+ i32 addrspace(1)* %out5,
+ i32 %one, i32 %two, i32 %three, i32 %four, i32 %five) #0 {
store i32 %one, i32 addrspace(1)* %out1
store i32 %two, i32 addrspace(1)* %out2
store i32 %three, i32 addrspace(1)* %out3
store i32 %four, i32 addrspace(1)* %out4
+ store i32 %five, i32 addrspace(1)* %out5
ret void
}
@@ -52,23 +49,26 @@ define void @max_12_sgprs(i32 addrspace(1)* %out1,
; TOSMEM: SGPRBlocks: 1
; TOSMEM: NumSGPRsForWavesPerEU: 16
-define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
+define amdgpu_kernel void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
i32 addrspace(1)* %out2,
i32 addrspace(1)* %out3,
i32 addrspace(1)* %out4,
i32 %one, i32 %two, i32 %three, i32 %four) #2 {
- store volatile i32 0, i32* undef
%x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
- store volatile i32 %x.0, i32 addrspace(1)* undef
%x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
- store volatile i32 %x.0, i32 addrspace(1)* undef
%x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
- store volatile i32 %x.0, i32 addrspace(1)* undef
%x.3 = call i64 @llvm.amdgcn.dispatch.id()
- store volatile i64 %x.3, i64 addrspace(1)* undef
%x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
- store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
%x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
+ store volatile i32 0, i32* undef
+ br label %stores
+
+stores:
+ store volatile i32 %x.0, i32 addrspace(1)* undef
+ store volatile i32 %x.0, i32 addrspace(1)* undef
+ store volatile i32 %x.0, i32 addrspace(1)* undef
+ store volatile i64 %x.3, i64 addrspace(1)* undef
+ store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef
store i32 %one, i32 addrspace(1)* %out1
@@ -90,7 +90,7 @@ define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
; XALL: SGPRBlocks: 2
; XALL: NumSGPRsForWavesPerEU: 18
-;define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
+;define amdgpu_kernel void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
; i32 addrspace(1)* %out2,
; i32 addrspace(1)* %out3,
; i32 addrspace(1)* %out4,
diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll b/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll
index 97feb7276b7d..979665ff0a80 100644
--- a/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll
+++ b/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll
@@ -5,7 +5,7 @@
; CHECK-LABEL: {{^}}max_20_vgprs:
; CHECK: VGPRBlocks: 4
; CHECK: NumVGPRsForWavesPerEU: 20
-define void @max_20_vgprs() #1 {
+define amdgpu_kernel void @max_20_vgprs() #1 {
%val0 = load volatile float, float addrspace(1)* @var
%val1 = load volatile float, float addrspace(1)* @var
%val2 = load volatile float, float addrspace(1)* @var
diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index 4f4efccc2260..3dda73bc336e 100644
--- a/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -4,9 +4,9 @@
; CHECK-LABEL: {{^}}empty_exactly_1:
; CHECK: SGPRBlocks: 12
; CHECK: VGPRBlocks: 32
-; CHECK: NumSGPRsForWavesPerEU: 97
+; CHECK: NumSGPRsForWavesPerEU: 102
; CHECK: NumVGPRsForWavesPerEU: 129
-define void @empty_exactly_1() #0 {
+define amdgpu_kernel void @empty_exactly_1() #0 {
entry:
ret void
}
@@ -16,9 +16,9 @@ attributes #0 = {"amdgpu-waves-per-eu"="1,1"}
; CHECK-LABEL: {{^}}empty_exactly_5:
; CHECK: SGPRBlocks: 12
; CHECK: VGPRBlocks: 10
-; CHECK: NumSGPRsForWavesPerEU: 97
+; CHECK: NumSGPRsForWavesPerEU: 102
; CHECK: NumVGPRsForWavesPerEU: 41
-define void @empty_exactly_5() #1 {
+define amdgpu_kernel void @empty_exactly_5() #1 {
entry:
ret void
}
@@ -30,7 +30,7 @@ attributes #1 = {"amdgpu-waves-per-eu"="5,5"}
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_exactly_10() #2 {
+define amdgpu_kernel void @empty_exactly_10() #2 {
entry:
ret void
}
@@ -42,7 +42,7 @@ attributes #2 = {"amdgpu-waves-per-eu"="10,10"}
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_at_least_1() #3 {
+define amdgpu_kernel void @empty_at_least_1() #3 {
entry:
ret void
}
@@ -54,7 +54,7 @@ attributes #3 = {"amdgpu-waves-per-eu"="1"}
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_at_least_5() #4 {
+define amdgpu_kernel void @empty_at_least_5() #4 {
entry:
ret void
}
@@ -66,7 +66,7 @@ attributes #4 = {"amdgpu-waves-per-eu"="5"}
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_at_least_10() #5 {
+define amdgpu_kernel void @empty_at_least_10() #5 {
entry:
ret void
}
@@ -78,9 +78,9 @@ attributes #5 = {"amdgpu-waves-per-eu"="10"}
; CHECK-LABEL: {{^}}empty_at_most_5:
; CHECK: SGPRBlocks: 12
; CHECK: VGPRBlocks: 10
-; CHECK: NumSGPRsForWavesPerEU: 97
+; CHECK: NumSGPRsForWavesPerEU: 102
; CHECK: NumVGPRsForWavesPerEU: 41
-define void @empty_at_most_5() #6 {
+define amdgpu_kernel void @empty_at_most_5() #6 {
entry:
ret void
}
@@ -92,7 +92,7 @@ attributes #6 = {"amdgpu-waves-per-eu"="1,5"}
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_at_most_10() #7 {
+define amdgpu_kernel void @empty_at_most_10() #7 {
entry:
ret void
}
@@ -106,7 +106,7 @@ attributes #7 = {"amdgpu-waves-per-eu"="1,10"}
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_between_5_and_10() #8 {
+define amdgpu_kernel void @empty_between_5_and_10() #8 {
entry:
ret void
}
@@ -120,7 +120,7 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"}
; CHECK: VGPRBlocks: 5
; CHECK: NumSGPRsForWavesPerEU: 13
; CHECK: NumVGPRsForWavesPerEU: 24
-define void @exactly_10() #9 {
+define amdgpu_kernel void @exactly_10() #9 {
%val0 = load volatile float, float addrspace(1)* @var
%val1 = load volatile float, float addrspace(1)* @var
%val2 = load volatile float, float addrspace(1)* @var
diff --git a/test/CodeGen/AMDGPU/attr-unparseable.ll b/test/CodeGen/AMDGPU/attr-unparseable.ll
index 0282bc34c0ee..17adb89900cd 100644
--- a/test/CodeGen/AMDGPU/attr-unparseable.ll
+++ b/test/CodeGen/AMDGPU/attr-unparseable.ll
@@ -1,56 +1,56 @@
; RUN: not llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s 2>&1 | FileCheck %s
; CHECK: can't parse integer attribute amdgpu-num-sgpr
-define void @unparseable_single_0() #0 {
+define amdgpu_kernel void @unparseable_single_0() #0 {
entry:
ret void
}
attributes #0 = {"amdgpu-num-sgpr"}
; CHECK: can't parse integer attribute amdgpu-num-sgpr
-define void @unparseable_single_1() #1 {
+define amdgpu_kernel void @unparseable_single_1() #1 {
entry:
ret void
}
attributes #1 = {"amdgpu-num-sgpr"="k"}
; CHECK: can't parse integer attribute amdgpu-num-sgpr
-define void @unparseable_single_2() #2 {
+define amdgpu_kernel void @unparseable_single_2() #2 {
entry:
ret void
}
attributes #2 = {"amdgpu-num-sgpr"="1,2"}
; CHECK: can't parse first integer attribute amdgpu-flat-work-group-size
-define void @unparseable_pair_0() #3 {
+define amdgpu_kernel void @unparseable_pair_0() #3 {
entry:
ret void
}
attributes #3 = {"amdgpu-flat-work-group-size"}
; CHECK: can't parse first integer attribute amdgpu-flat-work-group-size
-define void @unparseable_pair_1() #4 {
+define amdgpu_kernel void @unparseable_pair_1() #4 {
entry:
ret void
}
attributes #4 = {"amdgpu-flat-work-group-size"="k"}
; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size
-define void @unparseable_pair_2() #5 {
+define amdgpu_kernel void @unparseable_pair_2() #5 {
entry:
ret void
}
attributes #5 = {"amdgpu-flat-work-group-size"="1"}
; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size
-define void @unparseable_pair_3() #6 {
+define amdgpu_kernel void @unparseable_pair_3() #6 {
entry:
ret void
}
attributes #6 = {"amdgpu-flat-work-group-size"="1,k"}
; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size
-define void @unparseable_pair_4() #7 {
+define amdgpu_kernel void @unparseable_pair_4() #7 {
entry:
ret void
}
diff --git a/test/CodeGen/AMDGPU/barrier-elimination.ll b/test/CodeGen/AMDGPU/barrier-elimination.ll
new file mode 100644
index 000000000000..c526baaab9cd
--- /dev/null
+++ b/test/CodeGen/AMDGPU/barrier-elimination.ll
@@ -0,0 +1,30 @@
+; RUN: llc -march=amdgcn < %s | FileCheck %s
+
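+; s_barrier is only needed when a work-group can span more than one wavefront.
+; With "amdgpu-flat-work-group-size" capped at the wave size (64), every
+; work-item of a group runs in lockstep within a single wave, so the barrier
+; is deleted and only a scheduling-only wave barrier remains.
+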
+; CHECK-LABEL: {{^}}unknown_wgs:
+; CHECK: s_barrier
+define amdgpu_kernel void @unknown_wgs() {
+ tail call void @llvm.amdgcn.s.barrier() #0
+ ret void
+}
+
+; CHECK-LABEL: {{^}}flat_wgs_attr_32_128:
+; CHECK: s_barrier
+define amdgpu_kernel void @flat_wgs_attr_32_128() #1 {
+ tail call void @llvm.amdgcn.s.barrier() #0
+ ret void
+}
+
+; CHECK-LABEL: {{^}}flat_wgs_attr_32_64:
+; CHECK: :
+; CHECK-NEXT: ; wave barrier
+; CHECK-NEXT: s_endpgm
+define amdgpu_kernel void @flat_wgs_attr_32_64() #2 {
+ tail call void @llvm.amdgcn.s.barrier() #0
+ ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #0
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { nounwind "amdgpu-flat-work-group-size"="32,128" }
+attributes #2 = { nounwind "amdgpu-flat-work-group-size"="32,64" }
diff --git a/test/CodeGen/AMDGPU/basic-branch.ll b/test/CodeGen/AMDGPU/basic-branch.ll
index 104dd45e8a1a..e245e4296df2 100644
--- a/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/test/CodeGen/AMDGPU/basic-branch.ll
@@ -8,17 +8,14 @@
; GCNNOOPT: v_writelane_b32
; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
-
-; GCN: ; BB#1
; GCNNOOPT: v_readlane_b32
; GCNNOOPT: v_readlane_b32
; GCN: buffer_store_dword
-; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; TODO: This waitcnt can be eliminated
+; GCNNOOPT: s_endpgm
; GCN: {{^}}[[END]]:
; GCN: s_endpgm
-define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 {
+define amdgpu_kernel void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 {
%cmp = icmp ne i32 %val, 0
br i1 %cmp, label %store, label %end
@@ -42,7 +39,7 @@ end:
; GCN: {{^}}[[END]]:
; GCN: s_endpgm
-define void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 {
+define amdgpu_kernel void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 {
%cmp0 = icmp ne i1 %val, 0
br i1 %cmp0, label %store, label %end
diff --git a/test/CodeGen/AMDGPU/basic-loop.ll b/test/CodeGen/AMDGPU/basic-loop.ll
index f0263caf5d6b..de45190cdaa5 100644
--- a/test/CodeGen/AMDGPU/basic-loop.ll
+++ b/test/CodeGen/AMDGPU/basic-loop.ll
@@ -2,7 +2,7 @@
; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s
; CHECK-LABEL: {{^}}test_loop:
-define void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
+define amdgpu_kernel void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
entry:
br label %loop.body
diff --git a/test/CodeGen/AMDGPU/bfe-patterns.ll b/test/CodeGen/AMDGPU/bfe-patterns.ll
new file mode 100644
index 000000000000..c23cc1c88b52
--- /dev/null
+++ b/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -0,0 +1,163 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
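+; For a width in (0, 32), (src << (32 - width)) >>u (32 - width) equals
+; src & ((1 << width) - 1), an unsigned extract of the low 'width' bits, so it
+; selects to v_bfe_u32 with offset 0. The arithmetic-shift form sign-extends
+; the field and maps to v_bfe_i32.
+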
+; GCN-LABEL: {{^}}v_ubfe_sub_i32:
+; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
+; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[WIDTH]]
+define amdgpu_kernel void @v_ubfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x
+ %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+ %src = load volatile i32, i32 addrspace(1)* %in0.gep
+ %width = load volatile i32, i32 addrspace(1)* %in1.gep
+ %sub = sub i32 32, %width
+ %shl = shl i32 %src, %sub
+ %bfe = lshr i32 %shl, %sub
+ store i32 %bfe, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_ubfe_sub_multi_use_shl_i32:
+; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
+; GCN: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]]
+
+; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]]
+; SI-NEXT: v_lshr_b32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]]
+
+; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
+; VI-NEXT: v_lshrrev_b32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
+
+; GCN: [[BFE]]
+; GCN: [[SHL]]
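+; The extra use of the shl keeps the shift alive, so forming a BFE would not
+; save an instruction and the shl/lshr pair is kept.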
+define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x
+ %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+ %src = load volatile i32, i32 addrspace(1)* %in0.gep
+ %width = load volatile i32, i32 addrspace(1)* %in1.gep
+ %sub = sub i32 32, %width
+ %shl = shl i32 %src, %sub
+ %bfe = lshr i32 %shl, %sub
+ store i32 %bfe, i32 addrspace(1)* %out.gep
+ store volatile i32 %shl, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_ubfe_sub_i32:
+; GCN: s_load_dword [[SRC:s[0-9]+]]
+; GCN: s_load_dword [[WIDTH:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}}
+; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
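+; VOP3 instructions may read at most one SGPR (constant bus) operand, so with
+; both inputs scalar the width is first copied into a VGPR.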
+define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+ %sub = sub i32 32, %width
+ %shl = shl i32 %src, %sub
+ %bfe = lshr i32 %shl, %sub
+ store i32 %bfe, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_ubfe_sub_multi_use_shl_i32:
+; GCN: s_load_dword [[SRC:s[0-9]+]]
+; GCN: s_load_dword [[WIDTH:s[0-9]+]]
+; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]]
+; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]]
+; GCN-NEXT: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]]
+define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+ %sub = sub i32 32, %width
+ %shl = shl i32 %src, %sub
+ %bfe = lshr i32 %shl, %sub
+ store i32 %bfe, i32 addrspace(1)* %out.gep
+ store volatile i32 %shl, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_sbfe_sub_i32:
+; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
+; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[WIDTH]]
+define amdgpu_kernel void @v_sbfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x
+ %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+ %src = load volatile i32, i32 addrspace(1)* %in0.gep
+ %width = load volatile i32, i32 addrspace(1)* %in1.gep
+ %sub = sub i32 32, %width
+ %shl = shl i32 %src, %sub
+ %bfe = ashr i32 %shl, %sub
+ store i32 %bfe, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_sbfe_sub_multi_use_shl_i32:
+; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
+; GCN: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]]
+
+; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]]
+; SI-NEXT: v_ashr_i32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]]
+
+; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
+; VI-NEXT: v_ashrrev_i32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
+
+; GCN: [[BFE]]
+; GCN: [[SHL]]
+define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x
+ %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+ %src = load volatile i32, i32 addrspace(1)* %in0.gep
+ %width = load volatile i32, i32 addrspace(1)* %in1.gep
+ %sub = sub i32 32, %width
+ %shl = shl i32 %src, %sub
+ %bfe = ashr i32 %shl, %sub
+ store i32 %bfe, i32 addrspace(1)* %out.gep
+ store volatile i32 %shl, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_sbfe_sub_i32:
+; GCN: s_load_dword [[SRC:s[0-9]+]]
+; GCN: s_load_dword [[WIDTH:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}}
+; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
+define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+ %sub = sub i32 32, %width
+ %shl = shl i32 %src, %sub
+ %bfe = ashr i32 %shl, %sub
+ store i32 %bfe, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_sbfe_sub_multi_use_shl_i32:
+; GCN: s_load_dword [[SRC:s[0-9]+]]
+; GCN: s_load_dword [[WIDTH:s[0-9]+]]
+; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]]
+; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]]
+; GCN-NEXT: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]]
+define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+ %sub = sub i32 32, %width
+ %shl = shl i32 %src, %sub
+ %bfe = ashr i32 %shl, %sub
+ store i32 %bfe, i32 addrspace(1)* %out.gep
+ store volatile i32 %shl, i32 addrspace(1)* undef
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/bfe_uint.ll b/test/CodeGen/AMDGPU/bfe_uint.ll
index 32e3fc26106f..2c8c9a5ec932 100644
--- a/test/CodeGen/AMDGPU/bfe_uint.ll
+++ b/test/CodeGen/AMDGPU/bfe_uint.ll
@@ -2,7 +2,7 @@
; CHECK: {{^}}bfe_def:
; CHECK: BFE_UINT
-define void @bfe_def(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @bfe_def(i32 addrspace(1)* %out, i32 %x) {
entry:
%0 = lshr i32 %x, 5
%1 = and i32 %0, 15 ; 0xf
@@ -17,7 +17,7 @@ entry:
; CHECK: {{^}}bfe_shift:
; CHECK-NOT: BFE_UINT
-define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @bfe_shift(i32 addrspace(1)* %out, i32 %x) {
entry:
%0 = lshr i32 %x, 16
%1 = and i32 %0, 65535 ; 0xffff
diff --git a/test/CodeGen/AMDGPU/bfi_int.ll b/test/CodeGen/AMDGPU/bfi_int.ll
index 5156137fd78a..7870e5f378d3 100644
--- a/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/test/CodeGen/AMDGPU/bfi_int.ll
@@ -9,7 +9,7 @@
; R600: BFI_INT
; SI: @bfi_def
; SI: v_bfi_b32
-define void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
+define amdgpu_kernel void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
entry:
%0 = xor i32 %x, -1
%1 = and i32 %z, %0
@@ -25,7 +25,7 @@ entry:
; R600: BFI_INT
; SI: @bfi_sha256_ch
; SI: v_bfi_b32
-define void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
+define amdgpu_kernel void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
entry:
%0 = xor i32 %y, %z
%1 = and i32 %x, %0
@@ -42,7 +42,7 @@ entry:
; SI: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}}
; SI: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}}
-define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
+define amdgpu_kernel void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
entry:
%0 = and i32 %x, %z
%1 = or i32 %x, %z
diff --git a/test/CodeGen/AMDGPU/bfm.ll b/test/CodeGen/AMDGPU/bfm.ll
index 790458d0d60c..5673995588da 100644
--- a/test/CodeGen/AMDGPU/bfm.ll
+++ b/test/CodeGen/AMDGPU/bfm.ll
@@ -4,7 +4,7 @@
; FUNC-LABEL: {{^}}bfm_pattern:
; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
%a = shl i32 1, %x
%b = sub i32 %a, 1
%c = shl i32 %b, %y
@@ -14,7 +14,7 @@ define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
; FUNC-LABEL: {{^}}bfm_pattern_simple:
; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0
-define void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 {
%a = shl i32 1, %x
%b = sub i32 %a, 1
store i32 %b, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/big_alu.ll b/test/CodeGen/AMDGPU/big_alu.ll
index 0ab22b350f50..51387c8b79cb 100644
--- a/test/CodeGen/AMDGPU/big_alu.ll
+++ b/test/CodeGen/AMDGPU/big_alu.ll
@@ -2,7 +2,7 @@
; This test ensures that the R600 backend can handle ifcvt properly
-define amdgpu_ps void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) {
+define amdgpu_ps void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) #0 {
main_body:
%tmp = extractelement <4 x float> %reg0, i32 0
%tmp1 = extractelement <4 x float> %reg0, i32 1
@@ -224,28 +224,31 @@ ENDIF136: ; preds = %ENDIF154, %main_bod
%result.i = fadd float %mul.i, %one.sub.ac.i
%tmp204 = fadd float %result.i, 0x3FF4CCCCC0000000
%tmp205 = fmul float %tmp204, 0x3FE1C71C80000000
- %tmp206 = call float @llvm.AMDGPU.clamp.f32(float %tmp205, float 0.000000e+00, float 1.000000e+00)
+ %max.0.i = call float @llvm.maxnum.f32(float %tmp205, float 0.000000e+00)
+ %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00)
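+ ; clamp(x, 0.0, 1.0) is now expressed as minnum(maxnum(x, 0.0), 1.0), since
+ ; the llvm.AMDGPU.clamp intrinsic is gone.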
%tmp207 = fadd float %result.i, 0x3FF4CCCCC0000000
%tmp208 = fmul float %tmp207, 0x3FE1C71C80000000
- %tmp209 = call float @llvm.AMDGPU.clamp.f32(float %tmp208, float 0.000000e+00, float 1.000000e+00)
+ %max.0.i15 = call float @llvm.maxnum.f32(float %tmp208, float 0.000000e+00)
+ %clamp.i16 = call float @llvm.minnum.f32(float %max.0.i15, float 1.000000e+00)
%tmp210 = fadd float %result.i, 2.000000e+00
%tmp211 = fmul float %tmp210, 0x3FD611A7A0000000
- %tmp212 = call float @llvm.AMDGPU.clamp.f32(float %tmp211, float 0.000000e+00, float 1.000000e+00)
- %tmp213 = fmul float 2.000000e+00, %tmp206
+ %max.0.i13 = call float @llvm.maxnum.f32(float %tmp211, float 0.000000e+00)
+ %clamp.i14 = call float @llvm.minnum.f32(float %max.0.i13, float 1.000000e+00)
+ %tmp213 = fmul float 2.000000e+00, %clamp.i
%tmp214 = fsub float -0.000000e+00, %tmp213
%tmp215 = fadd float 3.000000e+00, %tmp214
- %tmp216 = fmul float %tmp206, %tmp215
- %tmp217 = fmul float %tmp206, %tmp216
- %tmp218 = fmul float 2.000000e+00, %tmp209
+ %tmp216 = fmul float %clamp.i, %tmp215
+ %tmp217 = fmul float %clamp.i, %tmp216
+ %tmp218 = fmul float 2.000000e+00, %clamp.i16
%tmp219 = fsub float -0.000000e+00, %tmp218
%tmp220 = fadd float 3.000000e+00, %tmp219
- %tmp221 = fmul float %tmp209, %tmp220
- %tmp222 = fmul float %tmp209, %tmp221
- %tmp223 = fmul float 2.000000e+00, %tmp212
+ %tmp221 = fmul float %clamp.i16, %tmp220
+ %tmp222 = fmul float %clamp.i16, %tmp221
+ %tmp223 = fmul float 2.000000e+00, %clamp.i14
%tmp224 = fsub float -0.000000e+00, %tmp223
%tmp225 = fadd float 3.000000e+00, %tmp224
- %tmp226 = fmul float %tmp212, %tmp225
- %tmp227 = fmul float %tmp212, %tmp226
+ %tmp226 = fmul float %clamp.i14, %tmp225
+ %tmp227 = fmul float %clamp.i14, %tmp226
%tmp228 = fmul float %tmp26, 0x3F368B5CC0000000
%tmp229 = fmul float %tmp27, 0x3F368B5CC0000000
%tmp230 = insertelement <4 x float> undef, float %tmp228, i32 0
@@ -282,28 +285,31 @@ ENDIF136: ; preds = %ENDIF154, %main_bod
%tmp261 = fmul float %tmp257, 0.000000e+00
%tmp262 = fadd float %result.i, 0x3FF4CCCCC0000000
%tmp263 = fmul float %tmp262, 0x3FE1C71C80000000
- %tmp264 = call float @llvm.AMDGPU.clamp.f32(float %tmp263, float 0.000000e+00, float 1.000000e+00)
+ %max.0.i11 = call float @llvm.maxnum.f32(float %tmp263, float 0.000000e+00)
+ %clamp.i12 = call float @llvm.minnum.f32(float %max.0.i11, float 1.000000e+00)
%tmp265 = fadd float %result.i, 0x3FF4CCCCC0000000
%tmp266 = fmul float %tmp265, 0x3FE1C71C80000000
- %tmp267 = call float @llvm.AMDGPU.clamp.f32(float %tmp266, float 0.000000e+00, float 1.000000e+00)
+ %max.0.i9 = call float @llvm.maxnum.f32(float %tmp266, float 0.000000e+00)
+ %clamp.i10 = call float @llvm.minnum.f32(float %max.0.i9, float 1.000000e+00)
%tmp268 = fadd float %result.i, 2.000000e+00
%tmp269 = fmul float %tmp268, 0x3FD611A7A0000000
- %tmp270 = call float @llvm.AMDGPU.clamp.f32(float %tmp269, float 0.000000e+00, float 1.000000e+00)
- %tmp271 = fmul float 2.000000e+00, %tmp264
+ %max.0.i7 = call float @llvm.maxnum.f32(float %tmp269, float 0.000000e+00)
+ %clamp.i8 = call float @llvm.minnum.f32(float %max.0.i7, float 1.000000e+00)
+ %tmp271 = fmul float 2.000000e+00, %clamp.i12
%tmp272 = fsub float -0.000000e+00, %tmp271
%tmp273 = fadd float 3.000000e+00, %tmp272
- %tmp274 = fmul float %tmp264, %tmp273
- %tmp275 = fmul float %tmp264, %tmp274
- %tmp276 = fmul float 2.000000e+00, %tmp267
+ %tmp274 = fmul float %clamp.i12, %tmp273
+ %tmp275 = fmul float %clamp.i12, %tmp274
+ %tmp276 = fmul float 2.000000e+00, %clamp.i10
%tmp277 = fsub float -0.000000e+00, %tmp276
%tmp278 = fadd float 3.000000e+00, %tmp277
- %tmp279 = fmul float %tmp267, %tmp278
- %tmp280 = fmul float %tmp267, %tmp279
- %tmp281 = fmul float 2.000000e+00, %tmp270
+ %tmp279 = fmul float %clamp.i10, %tmp278
+ %tmp280 = fmul float %clamp.i10, %tmp279
+ %tmp281 = fmul float 2.000000e+00, %clamp.i8
%tmp282 = fsub float -0.000000e+00, %tmp281
%tmp283 = fadd float 3.000000e+00, %tmp282
- %tmp284 = fmul float %tmp270, %tmp283
- %tmp285 = fmul float %tmp270, %tmp284
+ %tmp284 = fmul float %clamp.i8, %tmp283
+ %tmp285 = fmul float %clamp.i8, %tmp284
%tmp286 = fmul float %tmp26, 0x3F22DFD6A0000000
%tmp287 = fmul float %tmp27, 0x3F22DFD6A0000000
%tmp288 = insertelement <4 x float> undef, float %tmp286, i32 0
@@ -390,7 +396,8 @@ ENDIF136: ; preds = %ENDIF154, %main_bod
%tmp369 = fadd float %tmp368, %tmp367
%tmp370 = fadd float %tmp369, 0xBFEFAE1480000000
%tmp371 = fmul float %tmp370, 0xC023FFFFC0000000
- %tmp372 = call float @llvm.AMDGPU.clamp.f32(float %tmp371, float 0.000000e+00, float 1.000000e+00)
+ %max.0.i5 = call float @llvm.maxnum.f32(float %tmp371, float 0.000000e+00)
+ %clamp.i6 = call float @llvm.minnum.f32(float %max.0.i5, float 1.000000e+00)
%tmp373 = fsub float -0.000000e+00, %tmp339
%tmp374 = fadd float %result.i, %tmp373
%tmp375 = fadd float %tmp374, 0x3FBEB851E0000000
@@ -416,12 +423,13 @@ ENDIF136: ; preds = %ENDIF154, %main_bod
%tmp395 = fadd float %tmp394, %tmp393
%tmp396 = fadd float %tmp395, 0xBFEFAE1480000000
%tmp397 = fmul float %tmp396, 0xC0490001A0000000
- %tmp398 = call float @llvm.AMDGPU.clamp.f32(float %tmp397, float 0.000000e+00, float 1.000000e+00)
- %tmp399 = fmul float 2.000000e+00, %tmp372
+ %max.0.i3 = call float @llvm.maxnum.f32(float %tmp397, float 0.000000e+00)
+ %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00)
+ %tmp399 = fmul float 2.000000e+00, %clamp.i6
%tmp400 = fsub float -0.000000e+00, %tmp399
%tmp401 = fadd float 3.000000e+00, %tmp400
- %tmp402 = fmul float %tmp372, %tmp401
- %tmp403 = fmul float %tmp372, %tmp402
+ %tmp402 = fmul float %clamp.i6, %tmp401
+ %tmp403 = fmul float %clamp.i6, %tmp402
%one.sub.a.i169 = fsub float 1.000000e+00, %tmp403
%one.sub.ac.i170 = fmul float %one.sub.a.i169, %tmp349
%mul.i171 = fmul float %tmp258, %tmp349
@@ -438,11 +446,11 @@ ENDIF136: ; preds = %ENDIF154, %main_bod
%one.sub.ac.i158 = fmul float %one.sub.a.i157, 0.000000e+00
%mul.i159 = fmul float %tmp261, 0.000000e+00
%result.i160 = fadd float %mul.i159, %one.sub.ac.i158
- %tmp404 = fmul float 2.000000e+00, %tmp398
+ %tmp404 = fmul float 2.000000e+00, %clamp.i4
%tmp405 = fsub float -0.000000e+00, %tmp404
%tmp406 = fadd float 3.000000e+00, %tmp405
- %tmp407 = fmul float %tmp398, %tmp406
- %tmp408 = fmul float %tmp398, %tmp407
+ %tmp407 = fmul float %clamp.i4, %tmp406
+ %tmp408 = fmul float %clamp.i4, %tmp407
%one.sub.a.i153 = fsub float 1.000000e+00, %tmp408
%one.sub.ac.i154 = fmul float %one.sub.a.i153, %tmp375
%mul.i155 = fmul float %tmp258, %tmp375
@@ -1157,12 +1165,13 @@ IF179: ; preds = %ENDIF175
%tmp882 = fadd float %tmp881, %tmp880
%tmp883 = fadd float %tmp882, 0xBFEFAE1480000000
%tmp884 = fmul float %tmp883, 0xC043FFFE20000000
- %tmp885 = call float @llvm.AMDGPU.clamp.f32(float %tmp884, float 0.000000e+00, float 1.000000e+00)
- %tmp886 = fmul float 2.000000e+00, %tmp885
+ %max.0.i1 = call float @llvm.maxnum.f32(float %tmp884, float 0.000000e+00)
+ %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00)
+ %tmp886 = fmul float 2.000000e+00, %clamp.i2
%tmp887 = fsub float -0.000000e+00, %tmp886
%tmp888 = fadd float 3.000000e+00, %tmp887
- %tmp889 = fmul float %tmp885, %tmp888
- %tmp890 = fmul float %tmp885, %tmp889
+ %tmp889 = fmul float %clamp.i2, %tmp888
+ %tmp890 = fmul float %clamp.i2, %tmp889
%one.sub.a.i41 = fsub float 1.000000e+00, %tmp890
%one.sub.ac.i42 = fmul float %one.sub.a.i41, %tmp866
%mul.i43 = fmul float %temp84.5, %tmp866
@@ -1288,25 +1297,14 @@ ENDIF178: ; preds = %IF179, %ENDIF175
ret void
}
-; Function Attrs: nounwind readnone
-declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #0
-
-; Function Attrs: nounwind readnone
-declare float @llvm.r600.recipsqrt.clamped.f32(float) #0
-
-; Function Attrs: nounwind readonly
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
+declare float @llvm.r600.recipsqrt.clamped.f32(float) #1
declare float @llvm.fabs.f32(float) #1
-
-; Function Attrs: nounwind readnone
-declare float @llvm.exp2.f32(float) #0
-
-; Function Attrs: nounwind readnone
-declare float @llvm.AMDGPU.clamp.f32(float, float, float) #0
-
+declare float @llvm.exp2.f32(float) #1
declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind readonly }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/bitcast-vector-extract.ll b/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
index 3a55870c2882..cf95f74afb84 100644
--- a/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
+++ b/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
@@ -11,7 +11,7 @@
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
-define void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) {
+define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) {
%vec0.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8> to <8 x float>
store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
@@ -27,7 +27,7 @@ define void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %ou
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
-define void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) {
+define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) {
%vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <8 x float>
store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
@@ -43,7 +43,7 @@ define void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %ou
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
-define void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) {
+define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) {
%vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <4 x double>
store volatile <4 x double> %vec0.bc, <4 x double> addrspace(1)* %out
@@ -59,7 +59,7 @@ define void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %o
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
-define void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) {
+define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) {
%vec0.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 8> to <8 x float>
store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
@@ -67,3 +67,27 @@ define void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %o
store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out
ret void
}
+
+; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source:
+; GCN-NOT: store_dword
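+; llvm.amdgcn.icmp requires a constant condition-code operand; with the
+; variable %c the call is lowered to undef, so the bitcast and the stores it
+; feeds are expected to vanish.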
+define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source(<2 x i32> addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 {
+ %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 %c) #1
+ %bc = bitcast i64 %undef to <2 x i32>
+ store volatile <2 x i32> %bc, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source_extractelt:
+; GCN-NOT: store_dword
+define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source_extractelt(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 {
+ %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 %c) #1
+ %bc = bitcast i64 %undef to <2 x i32>
+ %elt1 = extractelement <2 x i32> %bc, i32 1
+ store volatile i32 %elt1, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i64 @llvm.amdgcn.icmp.i64(i64, i64, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone convergent }
diff --git a/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll b/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
index f7dc1a9d37e8..3616ec1f45d3 100644
--- a/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
+++ b/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
@@ -7,7 +7,7 @@
; GCN-LABEL: {{^}}materialize_0_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword [[K]]
-define void @materialize_0_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_0_i32(i32 addrspace(1)* %out) {
store i32 0, i32 addrspace(1)* %out
ret void
}
@@ -16,7 +16,7 @@ define void @materialize_0_i32(i32 addrspace(1)* %out) {
; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_0_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_0_i64(i64 addrspace(1)* %out) {
store i64 0, i64 addrspace(1)* %out
ret void
}
@@ -24,7 +24,7 @@ define void @materialize_0_i64(i64 addrspace(1)* %out) {
; GCN-LABEL: {{^}}materialize_neg1_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -1{{$}}
; GCN: buffer_store_dword [[K]]
-define void @materialize_neg1_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_neg1_i32(i32 addrspace(1)* %out) {
store i32 -1, i32 addrspace(1)* %out
ret void
}
@@ -33,7 +33,7 @@ define void @materialize_neg1_i32(i32 addrspace(1)* %out) {
; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_neg1_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_neg1_i64(i64 addrspace(1)* %out) {
store i64 -1, i64 addrspace(1)* %out
ret void
}
@@ -41,7 +41,7 @@ define void @materialize_neg1_i64(i64 addrspace(1)* %out) {
; GCN-LABEL: {{^}}materialize_signbit_i32:
; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 1{{$}}
; GCN: buffer_store_dword [[K]]
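+; 0x80000000 is not an inline constant, but its bit-reverse 1 is, so
+; v_bfrev_b32 of an inline immediate avoids the 32-bit literal.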
-define void @materialize_signbit_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_signbit_i32(i32 addrspace(1)* %out) {
store i32 -2147483648, i32 addrspace(1)* %out
ret void
}
@@ -50,7 +50,7 @@ define void @materialize_signbit_i32(i32 addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 1{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_signbit_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_signbit_i64(i64 addrspace(1)* %out) {
store i64 -9223372036854775808, i64 addrspace(1)* %out
ret void
}
@@ -58,7 +58,7 @@ define void @materialize_signbit_i64(i64 addrspace(1)* %out) {
; GCN-LABEL: {{^}}materialize_rev_neg16_i32:
; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], -16{{$}}
; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) {
store i32 268435455, i32 addrspace(1)* %out
ret void
}
@@ -67,7 +67,7 @@ define void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], -16{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) {
store i64 1152921504606846975, i64 addrspace(1)* %out
ret void
}
@@ -75,7 +75,7 @@ define void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) {
; GCN-LABEL: {{^}}materialize_rev_neg17_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xf7ffffff{{$}}
; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) {
store i32 -134217729, i32 addrspace(1)* %out
ret void
}
@@ -84,7 +84,7 @@ define void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0xf7ffffff{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) {
store i64 -576460752303423489, i64 addrspace(1)* %out
ret void
}
@@ -92,7 +92,7 @@ define void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) {
; GCN-LABEL: {{^}}materialize_rev_64_i32:
; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 64{{$}}
; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_64_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_64_i32(i32 addrspace(1)* %out) {
store i32 33554432, i32 addrspace(1)* %out
ret void
}
@@ -101,7 +101,7 @@ define void @materialize_rev_64_i32(i32 addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 64{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_64_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_64_i64(i64 addrspace(1)* %out) {
store i64 144115188075855872, i64 addrspace(1)* %out
ret void
}
@@ -109,7 +109,7 @@ define void @materialize_rev_64_i64(i64 addrspace(1)* %out) {
; GCN-LABEL: {{^}}materialize_rev_65_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x82000000{{$}}
; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_65_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_65_i32(i32 addrspace(1)* %out) {
store i32 -2113929216, i32 addrspace(1)* %out
ret void
}
@@ -118,7 +118,7 @@ define void @materialize_rev_65_i32(i32 addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0x82000000{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_65_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_65_i64(i64 addrspace(1)* %out) {
store i64 -9079256848778919936, i64 addrspace(1)* %out
ret void
}
@@ -126,7 +126,7 @@ define void @materialize_rev_65_i64(i64 addrspace(1)* %out) {
; GCN-LABEL: {{^}}materialize_rev_3_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -2.0{{$}}
; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_3_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_3_i32(i32 addrspace(1)* %out) {
store i32 -1073741824, i32 addrspace(1)* %out
ret void
}
@@ -135,7 +135,7 @@ define void @materialize_rev_3_i32(i32 addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], -2.0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_3_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_3_i64(i64 addrspace(1)* %out) {
store i64 -4611686018427387904, i64 addrspace(1)* %out
ret void
}
@@ -143,7 +143,7 @@ define void @materialize_rev_3_i64(i64 addrspace(1)* %out) {
; GCN-LABEL: {{^}}materialize_rev_1.0_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1fc{{$}}
; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) {
store i32 508, i32 addrspace(1)* %out
ret void
}
@@ -152,70 +152,70 @@ define void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0x1fc{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_1.0_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_1.0_i64(i64 addrspace(1)* %out) {
store i64 508, i64 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}s_materialize_0_i32:
; GCN: s_mov_b32 s{{[0-9]+}}, 0{{$}}
-define void @s_materialize_0_i32() {
+define amdgpu_kernel void @s_materialize_0_i32() {
call void asm sideeffect "; use $0", "s"(i32 0)
ret void
}
; GCN-LABEL: {{^}}s_materialize_1_i32:
; GCN: s_mov_b32 s{{[0-9]+}}, 1{{$}}
-define void @s_materialize_1_i32() {
+define amdgpu_kernel void @s_materialize_1_i32() {
call void asm sideeffect "; use $0", "s"(i32 1)
ret void
}
; GCN-LABEL: {{^}}s_materialize_neg1_i32:
; GCN: s_mov_b32 s{{[0-9]+}}, -1{{$}}
-define void @s_materialize_neg1_i32() {
+define amdgpu_kernel void @s_materialize_neg1_i32() {
call void asm sideeffect "; use $0", "s"(i32 -1)
ret void
}
; GCN-LABEL: {{^}}s_materialize_signbit_i32:
; GCN: s_brev_b32 s{{[0-9]+}}, 1{{$}}
-define void @s_materialize_signbit_i32() {
+define amdgpu_kernel void @s_materialize_signbit_i32() {
call void asm sideeffect "; use $0", "s"(i32 -2147483648)
ret void
}
; GCN-LABEL: {{^}}s_materialize_rev_64_i32:
; GCN: s_brev_b32 s{{[0-9]+}}, 64{{$}}
-define void @s_materialize_rev_64_i32() {
+define amdgpu_kernel void @s_materialize_rev_64_i32() {
call void asm sideeffect "; use $0", "s"(i32 33554432)
ret void
}
; GCN-LABEL: {{^}}s_materialize_rev_65_i32:
; GCN: s_mov_b32 s{{[0-9]+}}, 0x82000000{{$}}
-define void @s_materialize_rev_65_i32() {
+define amdgpu_kernel void @s_materialize_rev_65_i32() {
call void asm sideeffect "; use $0", "s"(i32 -2113929216)
ret void
}
; GCN-LABEL: {{^}}s_materialize_rev_neg16_i32:
; GCN: s_brev_b32 s{{[0-9]+}}, -16{{$}}
-define void @s_materialize_rev_neg16_i32() {
+define amdgpu_kernel void @s_materialize_rev_neg16_i32() {
call void asm sideeffect "; use $0", "s"(i32 268435455)
ret void
}
; GCN-LABEL: {{^}}s_materialize_rev_neg17_i32:
; GCN: s_mov_b32 s{{[0-9]+}}, 0xf7ffffff{{$}}
-define void @s_materialize_rev_neg17_i32() {
+define amdgpu_kernel void @s_materialize_rev_neg17_i32() {
call void asm sideeffect "; use $0", "s"(i32 -134217729)
ret void
}
; GCN-LABEL: {{^}}s_materialize_rev_1.0_i32:
; GCN: s_movk_i32 s{{[0-9]+}}, 0x1fc{{$}}
-define void @s_materialize_rev_1.0_i32() {
+define amdgpu_kernel void @s_materialize_rev_1.0_i32() {
call void asm sideeffect "; use $0", "s"(i32 508)
ret void
}
diff --git a/test/CodeGen/AMDGPU/bitreverse.ll b/test/CodeGen/AMDGPU/bitreverse.ll
index 43a4200cb3bd..539373f7bdeb 100644
--- a/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/test/CodeGen/AMDGPU/bitreverse.ll
@@ -14,7 +14,7 @@ declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1
; FUNC-LABEL: {{^}}s_brev_i16:
; SI: s_brev_b32
-define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
+define amdgpu_kernel void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
%brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
store i16 %brev, i16 addrspace(1)* %out
ret void
@@ -22,7 +22,7 @@ define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
; FUNC-LABEL: {{^}}v_brev_i16:
; SI: v_bfrev_b32_e32
-define void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 {
%val = load i16, i16 addrspace(1)* %valptr
%brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
store i16 %brev, i16 addrspace(1)* %out
@@ -35,7 +35,7 @@ define void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalia
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
; SI: buffer_store_dword [[VRESULT]],
; SI: s_endpgm
-define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 {
+define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 {
%brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
store i32 %brev, i32 addrspace(1)* %out
ret void
@@ -46,7 +46,7 @@ define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 {
; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
-define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
%val = load i32, i32 addrspace(1)* %valptr
%brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
store i32 %brev, i32 addrspace(1)* %out
@@ -56,7 +56,7 @@ define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalia
; FUNC-LABEL: {{^}}s_brev_v2i32:
; SI: s_brev_b32
; SI: s_brev_b32
-define void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 {
+define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 {
%brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
ret void
@@ -65,7 +65,7 @@ define void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val)
; FUNC-LABEL: {{^}}v_brev_v2i32:
; SI: v_bfrev_b32_e32
; SI: v_bfrev_b32_e32
-define void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
%val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr
%brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
@@ -73,7 +73,7 @@ define void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrsp
}
; FUNC-LABEL: {{^}}s_brev_i64:
-define void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
+define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
%brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
store i64 %brev, i64 addrspace(1)* %out
ret void
@@ -81,7 +81,7 @@ define void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
; FUNC-LABEL: {{^}}v_brev_i64:
; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0
-define void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
%val = load i64, i64 addrspace(1)* %valptr
%brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
store i64 %brev, i64 addrspace(1)* %out
@@ -89,14 +89,14 @@ define void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalia
}
; FUNC-LABEL: {{^}}s_brev_v2i64:
-define void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 {
+define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 {
%brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}v_brev_v2i64:
-define void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
%val = load <2 x i64>, <2 x i64> addrspace(1)* %valptr
%brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/br_cc.f16.ll b/test/CodeGen/AMDGPU/br_cc.f16.ll
index 0072d384f217..b7a0c8738dfa 100644
--- a/test/CodeGen/AMDGPU/br_cc.f16.ll
+++ b/test/CodeGen/AMDGPU/br_cc.f16.ll
@@ -1,27 +1,26 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; GCN-LABEL: {{^}}br_cc_f16
+; GCN-LABEL: {{^}}br_cc_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
+; SI: v_cmp_nlt_f32_e32 vcc, v[[B_F32]], v[[A_F32]]
; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: s_cbranch_vccnz
; GCN: one{{$}}
-; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]]
-; SI: s_branch
-; VI: buffer_store_short
-; VI: s_endpgm
+; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[B_F32]]
+; GCN: buffer_store_short
+; GCN: s_endpgm
; GCN: two{{$}}
-; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
+; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_short v[[B_F16]]
; GCN: s_endpgm
-define void @br_cc_f16(
+define amdgpu_kernel void @br_cc_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -40,29 +39,27 @@ two:
ret void
}
-; GCN-LABEL: {{^}}br_cc_f16_imm_a
-; SI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x3800{{$}}
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
+; GCN-LABEL: {{^}}br_cc_f16_imm_a:
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_cmp_ngt_f32_e32 vcc, v[[B_F32]], v[[A_F32]]
-; SI: s_cbranch_vccz
+; SI: v_cmp_nlt_f32_e32 vcc, 0.5, v[[B_F32]]
+; SI: s_cbranch_vccnz
; VI: v_cmp_nlt_f16_e32 vcc, 0.5, v[[B_F16]]
; VI: s_cbranch_vccnz
-; VI: one{{$}}
+; GCN: one{{$}}
; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x380{{0|1}}{{$}}
-; GCN: two{{$}}
-; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
-
-; SI: one{{$}}
; SI: buffer_store_short v[[A_F16]]
; SI: s_endpgm
-define void @br_cc_f16_imm_a(
+
+; GCN: two{{$}}
+; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
+
+define amdgpu_kernel void @br_cc_f16_imm_a(
half addrspace(1)* %r,
half addrspace(1)* %b) {
entry:
@@ -79,13 +76,12 @@ two:
ret void
}
-; GCN-LABEL: {{^}}br_cc_f16_imm_b
-; SI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}}
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
+; GCN-LABEL: {{^}}br_cc_f16_imm_b:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
+; SI: v_cmp_ngt_f32_e32 vcc, 0.5, v[[A_F32]]
+
; VI: v_cmp_ngt_f16_e32 vcc, 0.5, v[[A_F16]]
; GCN: s_cbranch_vccnz
@@ -96,7 +92,7 @@ two:
; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}}
; GCN: buffer_store_short v[[B_F16]]
; GCN: s_endpgm
-define void @br_cc_f16_imm_b(
+define amdgpu_kernel void @br_cc_f16_imm_b(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/branch-condition-and.ll b/test/CodeGen/AMDGPU/branch-condition-and.ll
index 94616a4be8fd..68b77ea3490e 100644
--- a/test/CodeGen/AMDGPU/branch-condition-and.ll
+++ b/test/CodeGen/AMDGPU/branch-condition-and.ll
@@ -15,12 +15,16 @@
; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]]
; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]]
; GCN: s_xor_b64 {{s\[[0-9]+:[0-9]+\]}}, exec, [[SAVED]]
-;
-; TODO: The following sequence is a bug (missing s_endpgm)!
-;
-; GCN: s_branch [[BB:BB[0-9]+_[0-9]+]]
-; GCN: [[BB]]:
-; GCN-NEXT: .Lfunc_end0:
+; GCN: ; mask branch [[BB5:BB[0-9]+_[0-9]+]]
+
+; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %bb4
+; GCN: ds_write_b32
+; GCN: s_waitcnt
+
+; GCN-NEXT: [[BB5]]
+; GCN: s_or_b64 exec, exec
+; GCN-NEXT: s_endpgm
+; GCN-NEXT: .Lfunc_end
define amdgpu_ps void @ham(float %arg, float %arg1) #0 {
bb:
%tmp = fcmp ogt float %arg, 0.000000e+00
@@ -29,6 +33,7 @@ bb:
br i1 %tmp3, label %bb4, label %bb5
bb4: ; preds = %bb
+ store volatile i32 4, i32 addrspace(3)* undef
unreachable
bb5: ; preds = %bb
diff --git a/test/CodeGen/AMDGPU/branch-relax-spill.ll b/test/CodeGen/AMDGPU/branch-relax-spill.ll
index 86b8dd89e7d0..ede15559c4ff 100644
--- a/test/CodeGen/AMDGPU/branch-relax-spill.ll
+++ b/test/CodeGen/AMDGPU/branch-relax-spill.ll
@@ -5,7 +5,7 @@
; FAIL: LLVM ERROR: Error while trying to spill VCC from class SReg_64: Cannot scavenge register without an emergency spill slot!
-define void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 {
+define amdgpu_kernel void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 {
entry:
%sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={SGPR0}"() #0
%sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={SGPR1}"() #0
diff --git a/test/CodeGen/AMDGPU/branch-relaxation.ll b/test/CodeGen/AMDGPU/branch-relaxation.ll
index 39505404a868..263059d4a6ed 100644
--- a/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -26,7 +26,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
; GCN: buffer_store_dword [[V_CND]]
; GCN: s_endpgm
-define void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
+define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
bb:
%cmp = icmp eq i32 %cnd, 0
br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch
@@ -68,7 +68,7 @@ bb3:
; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
; GCN: buffer_store_dword [[V_CND]]
; GCN: s_endpgm
-define void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
+define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
bb0:
%cmp = icmp eq i32 %cnd, 0
br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch
@@ -108,7 +108,7 @@ bb3:
; GCN: [[ENDBB]]:
; GCN: buffer_store_dword [[V_CND]]
; GCN: s_endpgm
-define void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 {
+define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 {
bb0:
%cmp = fcmp oeq float %cnd, 0.0
br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch
@@ -141,7 +141,7 @@ bb3:
; GCN: s_or_b64 exec, exec, [[SAVE]]
; GCN: buffer_store_dword
; GCN: s_endpgm
-define void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 {
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = zext i32 %tid to i64
@@ -188,7 +188,7 @@ bb3:
; GCN-NEXT: [[ENDBB]]:
; GCN-NEXT: s_endpgm
-define void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 {
bb:
br label %bb2
@@ -243,7 +243,7 @@ bb3:
; GCN: buffer_store_dword [[BB4_K]]
; GCN-NEXT: s_endpgm
; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
-define void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
+define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
bb0:
%tmp = icmp ne i32 %arg1, 0
br i1 %tmp, label %bb2, label %bb3
@@ -285,7 +285,7 @@ bb4:
; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0{{$}}
; GCN-NEXT: s_setpc_b64 vcc
; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
-define void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
+define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
entry:
br label %loop
@@ -335,8 +335,14 @@ loop:
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: [[BB3]]: ; %bb3
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_endpgm
-define void @expand_requires_expand(i32 %cond0) #0 {
+define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
bb0:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%cmp0 = icmp slt i32 %cond0, 0
@@ -356,6 +362,12 @@ bb2:
br label %bb3
bb3:
+; These NOPs prevent tail-duplication-based outlining from firing, which
+; would remove the need to expand the branches and defeat this test.
+ call void asm sideeffect
+ "v_nop_e64", ""() #0
+ call void asm sideeffect
+ "v_nop_e64", ""() #0
ret void
}
@@ -385,8 +397,9 @@ bb3:
; GCN-NEXT: [[ENDIF]]: ; %endif
; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
+; GCN-NEXT: s_sleep 5
; GCN-NEXT: s_endpgm
-define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 {
+define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%d_cmp = icmp ult i32 %tid, 16
@@ -402,6 +415,9 @@ if_uniform:
br label %endif
endif:
+ ; Block layout can remove the split branch if it can copy the return block.
+ ; This call makes the return block long enough that it doesn't get copied.
+ call void @llvm.amdgcn.s.sleep(i32 5)
ret void
}
@@ -446,7 +462,7 @@ endif:
; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
; GCN: buffer_store_dword
; GCN-NEXT: s_endpgm
-define void @analyze_mask_branch() #0 {
+define amdgpu_kernel void @analyze_mask_branch() #0 {
entry:
%reg = call float asm sideeffect "v_mov_b32_e64 $0, 0", "=v"()
%cmp0 = fcmp ogt float %reg, 0.000000e+00
@@ -475,7 +491,8 @@ ret:
; GCN-LABEL: {{^}}long_branch_hang:
; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
-; GCN-NEXT: s_cbranch_scc0 [[LONG_BR_0:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}}
+; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]]
; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:
; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
diff --git a/test/CodeGen/AMDGPU/bswap.ll b/test/CodeGen/AMDGPU/bswap.ll
index c68951731098..d2dacd7c17b3 100644
--- a/test/CodeGen/AMDGPU/bswap.ll
+++ b/test/CodeGen/AMDGPU/bswap.ll
@@ -17,7 +17,7 @@ declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone
; SI: v_bfi_b32 [[RESULT:v[0-9]+]], [[K]], [[TMP1]], [[TMP0]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
%val = load i32, i32 addrspace(1)* %in, align 4
%bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone
store i32 %bswap, i32 addrspace(1)* %out, align 4
@@ -32,7 +32,7 @@ define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
; SI-DAG: v_alignbit_b32
; SI-DAG: v_bfi_b32
; SI: s_endpgm
-define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
%val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
%bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone
store <2 x i32> %bswap, <2 x i32> addrspace(1)* %out, align 8
@@ -53,7 +53,7 @@ define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(
; SI-DAG: v_alignbit_b32
; SI-DAG: v_bfi_b32
; SI: s_endpgm
-define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind {
%val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
%bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone
store <4 x i32> %bswap, <4 x i32> addrspace(1)* %out, align 16
@@ -86,7 +86,7 @@ define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(
; SI-DAG: v_alignbit_b32
; SI-DAG: v_bfi_b32
; SI: s_endpgm
-define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind {
%val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
%bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone
store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32
@@ -95,21 +95,21 @@ define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(
; FUNC-LABEL: {{^}}test_bswap_i64:
; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0
-define void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
%val = load i64, i64 addrspace(1)* %in, align 8
%bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
store i64 %bswap, i64 addrspace(1)* %out, align 8
ret void
}
-define void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind {
%val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
%bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone
store <2 x i64> %bswap, <2 x i64> addrspace(1)* %out, align 16
ret void
}
-define void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind {
%val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
%bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone
store <4 x i64> %bswap, <4 x i64> addrspace(1)* %out, align 32
diff --git a/test/CodeGen/AMDGPU/build_vector.ll b/test/CodeGen/AMDGPU/build_vector.ll
index 0a5774c601d3..d77b0ab9fbb6 100644
--- a/test/CodeGen/AMDGPU/build_vector.ll
+++ b/test/CodeGen/AMDGPU/build_vector.ll
@@ -10,7 +10,7 @@
; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5
; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6
; SI: buffer_store_dwordx2 v{{\[}}[[X]]:[[Y]]{{\]}}
-define void @build_vector2 (<2 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @build_vector2 (<2 x i32> addrspace(1)* %out) {
entry:
store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out
ret void
@@ -28,7 +28,7 @@ entry:
; SI-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7
; SI-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8
; SI: buffer_store_dwordx4 v{{\[}}[[X]]:[[W]]{{\]}}
-define void @build_vector4 (<4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @build_vector4 (<4 x i32> addrspace(1)* %out) {
entry:
store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/call.ll b/test/CodeGen/AMDGPU/call.ll
index 6d101e1537cc..769c7bb3eee7 100644
--- a/test/CodeGen/AMDGPU/call.ll
+++ b/test/CodeGen/AMDGPU/call.ll
@@ -10,7 +10,7 @@
declare i32 @external_function(i32) nounwind
-define void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load i32, i32 addrspace(1)* %in
%b = load i32, i32 addrspace(1)* %b_ptr
@@ -25,7 +25,7 @@ define i32 @defined_function(i32 %x) nounwind noinline {
ret i32 %y
}
-define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load i32, i32 addrspace(1)* %in
%b = load i32, i32 addrspace(1)* %b_ptr
@@ -35,7 +35,7 @@ define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
ret void
}
-define void @test_tail_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_tail_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load i32, i32 addrspace(1)* %in
%b = load i32, i32 addrspace(1)* %b_ptr
diff --git a/test/CodeGen/AMDGPU/calling-conventions.ll b/test/CodeGen/AMDGPU/calling-conventions.ll
index 57adc8be6a99..677147b6f4e5 100644
--- a/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -1,9 +1,10 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; Make sure we don't crash or assert on spir_kernel calling convention.
-; SI-LABEL: {{^}}kernel:
-; SI: s_endpgm
+; GCN-LABEL: {{^}}kernel:
+; GCN: s_endpgm
define spir_kernel void @kernel(i32 addrspace(1)* %out) {
entry:
store i32 0, i32 addrspace(1)* %out
@@ -11,10 +12,34 @@ entry:
}
; FIXME: This is treated like a kernel
-; SI-LABEL: {{^}}func:
-; SI: s_endpgm
-define spir_func void @func(i32 addrspace(1)* %out) {
-entry:
- store i32 0, i32 addrspace(1)* %out
- ret void
+; XGCN-LABEL: {{^}}func:
+; XGCN: s_endpgm
+; define spir_func void @func(i32 addrspace(1)* %out) {
+; entry:
+; store i32 0, i32 addrspace(1)* %out
+; ret void
+; }
+
+; GCN-LABEL: {{^}}ps_ret_cc_f16:
+; SI: v_cvt_f16_f32_e32 v0, v0
+; SI: v_cvt_f32_f16_e32 v0, v0
+; SI: v_add_f32_e32 v0, 1.0, v0
+
+; VI: v_add_f16_e32 v0, 1.0, v0
+; VI: ; return
+define amdgpu_ps half @ps_ret_cc_f16(half %arg0) {
+ %add = fadd half %arg0, 1.0
+ ret half %add
+}
+
+; GCN-LABEL: {{^}}ps_ret_cc_inreg_f16:
+; SI: v_cvt_f16_f32_e32 v0, s0
+; SI: v_cvt_f32_f16_e32 v0, v0
+; SI: v_add_f32_e32 v0, 1.0, v0
+
+; VI: v_add_f16_e64 v0, s0, 1.0
+; VI: ; return
+define amdgpu_ps half @ps_ret_cc_inreg_f16(half inreg %arg0) {
+ %add = fadd half %arg0, 1.0
+ ret half %add
}
diff --git a/test/CodeGen/AMDGPU/captured-frame-index.ll b/test/CodeGen/AMDGPU/captured-frame-index.ll
index 49af159581f7..5fe1b2728506 100644
--- a/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -1,24 +1,24 @@
; RUN: llc -mtriple=amdgcn-- -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
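+; Frame index materializations below changed from 0 to 4 because the first 4
+; bytes of scratch are now reserved, shifting every stack object up by 4.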
; GCN-LABEL: {{^}}store_fi_lifetime:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
; GCN: buffer_store_dword [[FI]]
-define void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 {
entry:
%b = alloca i8
- call void @llvm.lifetime.start(i64 1, i8* %b)
+ call void @llvm.lifetime.start.p0i8(i64 1, i8* %b)
store volatile i8* %b, i8* addrspace(1)* undef
- call void @llvm.lifetime.end(i64 1, i8* %b)
+ call void @llvm.lifetime.end.p0i8(i64 1, i8* %b)
ret void
}
; GCN-LABEL: {{^}}stored_fi_to_lds:
; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
; GCN: buffer_store_dword v{{[0-9]+}}, off,
-; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 4{{$}}
; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]
-define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
%tmp = alloca float
store float 4.0, float *%tmp
store float* %tmp, float* addrspace(3)* %ptr
@@ -27,18 +27,18 @@ define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
; Offset is applied
; GCN-LABEL: {{^}}stored_fi_to_lds_2_small_objects:
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
; GCN-DAG: s_load_dword [[LDSPTR:s[0-9]+]]
; GCN-DAG: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO]]
-; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
; GCN: ds_write_b32 [[VLDSPTR]], [[FI1]]
-define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
%tmp0 = alloca float
%tmp1 = alloca float
store float 4.0, float* %tmp0
@@ -51,10 +51,10 @@ define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
; Same frame index is used multiple times in the store
; GCN-LABEL: {{^}}stored_fi_to_self:
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4d2{{$}}
-; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
-define void @stored_fi_to_self() #0 {
+; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}}
+; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
+define amdgpu_kernel void @stored_fi_to_self() #0 {
%tmp = alloca i32*
; Avoid optimizing everything out
@@ -66,14 +66,14 @@ define void @stored_fi_to_self() #0 {
; GCN-LABEL: {{^}}stored_fi_to_self_offset:
; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 32{{$}}
-; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
+; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x4d2{{$}}
-; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2048{{$}}
+; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}}
-; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x800{{$}}
-; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2048{{$}}
-define void @stored_fi_to_self_offset() #0 {
+; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}}
+; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}}
+define amdgpu_kernel void @stored_fi_to_self_offset() #0 {
%tmp0 = alloca [512 x i32]
%tmp1 = alloca i32*
@@ -89,16 +89,16 @@ define void @stored_fi_to_self_offset() #0 {
}
; GCN-LABEL: {{^}}stored_fi_to_fi:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}}
-; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
-; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
+; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
+; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}}
-; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
-; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
-define void @stored_fi_to_fi() #0 {
+; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
+; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
+define amdgpu_kernel void @stored_fi_to_fi() #0 {
%tmp0 = alloca i32*
%tmp1 = alloca i32*
%tmp2 = alloca i32*
@@ -115,10 +115,10 @@ define void @stored_fi_to_fi() #0 {
}
; GCN-LABEL: {{^}}stored_fi_to_global:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
; GCN: buffer_store_dword [[FI]]
-define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 {
%tmp = alloca float
store float 0.0, float *%tmp
store float* %tmp, float* addrspace(1)* %ptr
@@ -127,16 +127,16 @@ define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 {
; Offset is applied
; GCN-LABEL: {{^}}stored_fi_to_global_2_small_objects:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}}
-; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
+; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
%tmp0 = alloca float
%tmp1 = alloca float
%tmp2 = alloca float
@@ -150,10 +150,10 @@ define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
; GCN-LABEL: {{^}}stored_fi_to_global_huge_frame_offset:
; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}}
-; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
+; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
; FIXME: Re-initialize
-; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 4{{$}}
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
; GCN-DAG: v_add_i32_e32 [[BASE_1_OFF_1:v[0-9]+]], vcc, 0x3ffc, [[BASE_0_1]]
@@ -163,7 +163,7 @@ define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_1]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
; GCN: buffer_store_dword [[BASE_1_OFF_2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
%tmp0 = alloca [4096 x i32]
%tmp1 = alloca [4096 x i32]
%gep0.tmp0 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 0
@@ -184,9 +184,9 @@ define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN: s_add_u32 s{{[0-9]+}}, s[[PC_LO]], g1@gotpcrel32@lo+4
; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC_HI]], g1@gotpcrel32@hi+4
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
; GCN: buffer_store_dword [[FI]]
-define void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 {
+define amdgpu_kernel void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 {
entry:
%b = alloca i32, align 4
%tmp1 = load volatile i32*, i32* addrspace(1)* @g1, align 4
@@ -196,8 +196,8 @@ entry:
ret void
}
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
attributes #0 = { nounwind }
attributes #1 = { argmemonly nounwind }
diff --git a/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
index 0d919bbf85e3..697f26b83a4d 100644
--- a/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ b/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -2,12 +2,12 @@
; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s
; GCN-LABEL: {{^}}test_loop:
-; GCN: [[LABEL:BB[0-9+]_[0-9]+]]:
+; GCN: [[LABEL:BB[0-9]+_[0-9]+]]: ; %for.body{{$}}
; GCN: ds_read_b32
; GCN: ds_write_b32
; GCN: s_branch [[LABEL]]
; GCN: s_endpgm
-define void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
+define amdgpu_kernel void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
entry:
%cmp = icmp eq i32 %n, -1
br i1 %cmp, label %for.exit, label %for.body
@@ -31,7 +31,7 @@ for.body:
; GCN: ds_read_b32
; GCN: ds_write_b32
; GCN: s_branch [[LABEL]]
-define void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind {
+define amdgpu_kernel void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind {
entry:
br label %for.body
@@ -52,7 +52,7 @@ for.body:
; GCN-LABEL: {{^}}loop_const_false:
; GCN-NOT: s_branch
; GCN: s_endpgm
-define void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind {
+define amdgpu_kernel void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind {
entry:
br label %for.body
@@ -74,7 +74,7 @@ for.body:
; GCN-LABEL: {{^}}loop_const_undef:
; GCN-NOT: s_branch
; GCN: s_endpgm
-define void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind {
+define amdgpu_kernel void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind {
entry:
br label %for.body
@@ -104,7 +104,7 @@ for.body:
; GCN: s_cbranch_vccnz [[LOOPBB]]
; GCN-NEXT: ; BB#2
; GCN-NEXT: s_endpgm
-define void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind {
+define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind {
entry:
br label %for.body
diff --git a/test/CodeGen/AMDGPU/cf-stack-bug.ll b/test/CodeGen/AMDGPU/cf-stack-bug.ll
index 75b87e486226..53fe89730f3a 100644
--- a/test/CodeGen/AMDGPU/cf-stack-bug.ll
+++ b/test/CodeGen/AMDGPU/cf-stack-bug.ll
@@ -35,7 +35,7 @@
; BUG32-NOT: Applying bug work-around
; NOBUG-NOT: Applying bug work-around
; FUNC-LABEL: {{^}}nested3:
-define void @nested3(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @nested3(i32 addrspace(1)* %out, i32 %cond) {
entry:
%0 = icmp sgt i32 %cond, 0
br i1 %0, label %if.1, label %end
@@ -68,7 +68,7 @@ end:
; BUG32-NOT: Applying bug work-around
; NOBUG-NOT: Applying bug work-around
; FUNC-LABEL: {{^}}nested4:
-define void @nested4(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @nested4(i32 addrspace(1)* %out, i32 %cond) {
entry:
%0 = icmp sgt i32 %cond, 0
br i1 %0, label %if.1, label %end
@@ -109,7 +109,7 @@ end:
; BUG32-NOT: Applying bug work-around
; NOBUG-NOT: Applying bug work-around
; FUNC-LABEL: {{^}}nested7:
-define void @nested7(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @nested7(i32 addrspace(1)* %out, i32 %cond) {
entry:
%0 = icmp sgt i32 %cond, 0
br i1 %0, label %if.1, label %end
@@ -174,7 +174,7 @@ end:
; BUG32: Applying bug work-around
; NOBUG-NOT: Applying bug work-around
; FUNC-LABEL: {{^}}nested8:
-define void @nested8(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @nested8(i32 addrspace(1)* %out, i32 %cond) {
entry:
%0 = icmp sgt i32 %cond, 0
br i1 %0, label %if.1, label %end
diff --git a/test/CodeGen/AMDGPU/cf_end.ll b/test/CodeGen/AMDGPU/cf_end.ll
index c74ee22868d5..3c990e0a4bd6 100644
--- a/test/CodeGen/AMDGPU/cf_end.ll
+++ b/test/CodeGen/AMDGPU/cf_end.ll
@@ -4,6 +4,6 @@
; EG: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x80]
; CM: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x88]
-define void @eop() {
+define amdgpu_kernel void @eop() {
ret void
}
diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
index 6db9a0761a01..cbdcf6aeaf42 100644
--- a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
@@ -11,7 +11,7 @@
; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32:
; GCN: flat_load_dword
; GCN: {{^}}BB0_2:
-define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
+define amdgpu_kernel void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
entry:
%out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
@@ -36,14 +36,14 @@ done:
; OPT-CI-NOT: getelementptr
; OPT: br i1
-; OPT-CI: ptrtoint
-; OPT-CI: add
-; OPT-CI: inttoptr
+; OPT-CI: addrspacecast
+; OPT-CI: getelementptr
+; OPT-CI: bitcast
; OPT: br label
; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_global_i32:
; CI: buffer_load_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
-define void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
+define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
entry:
%out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
@@ -69,14 +69,14 @@ done:
; OPT-CI-NOT: getelementptr
; OPT: br i1
-; OPT-CI: ptrtoint
-; OPT-CI: add
-; OPT-CI: inttoptr
+; OPT-CI: addrspacecast
+; OPT-CI: getelementptr
+; OPT-CI: bitcast
; OPT: br label
; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_constant_i32:
; CI: s_load_dword {{s[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-define void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
+define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
entry:
%out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
index 2ed2857ff340..c1cf56e5058e 100644
--- a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -5,15 +5,17 @@
; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
; OPT-LABEL: @test_sink_global_small_offset_i32(
; OPT-CI-NOT: getelementptr i32, i32 addrspace(1)* %in
; OPT-VI: getelementptr i32, i32 addrspace(1)* %in
; OPT: br i1
-; OPT-CI: ptrtoint
+; OPT-CI: getelementptr i8,
; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
; GCN: {{^}}BB0_2:
-define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7
@@ -43,7 +45,7 @@ done:
; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
; GCN: {{^}}BB1_2:
; GCN: s_or_b64 exec
-define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
%in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
@@ -70,7 +72,7 @@ done:
; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
; GCN: {{^}}BB2_2:
; GCN: s_or_b64 exec
-define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
%in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095
@@ -97,7 +99,7 @@ done:
; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
; GCN: {{^}}BB3_2:
; GCN: s_or_b64 exec
-define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
%in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096
@@ -122,14 +124,55 @@ done:
; OPT-LABEL: @test_sink_scratch_small_offset_i32(
; OPT-NOT: getelementptr [512 x i32]
; OPT: br i1
-; OPT: ptrtoint
+; OPT: getelementptr i8,
; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32:
; GCN: s_and_saveexec_b64
; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
; GCN: {{^}}BB4_2:
-define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
+define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
+entry:
+ %alloca = alloca [512 x i32], align 4
+ %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
+ %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
+ %add.arg = add i32 %arg, 8
+ %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1022
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
+ br i1 %tmp0, label %endif, label %if
+
+if:
+ store volatile i32 123, i32* %alloca.gep
+ %tmp1 = load volatile i32, i32* %alloca.gep
+ br label %endif
+
+endif:
+ %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+ store i32 %x, i32 addrspace(1)* %out.gep.0
+ %load = load volatile i32, i32* %alloca.gep
+ store i32 %load, i32 addrspace(1)* %out.gep.1
+ br label %done
+
+done:
+ ret void
+}
+
+; This ends up not fitting due to the reserved 4 bytes at offset 0
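+; (the total offset exceeds the 4095 maximum MUBUF immediate, so the access
+; needs a base register, checked below as "offen", plus the immediate).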
+; OPT-LABEL: @test_sink_scratch_small_offset_i32_reserved(
+; OPT-NOT: getelementptr [512 x i32]
+; OPT: br i1
+; OPT: getelementptr i8,
+
+; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32_reserved:
+; GCN: s_and_saveexec_b64
+; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 4
+; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
+; GCN: v_mov_b32_e32 [[BASE_FI1:v[0-9]+]], 4
+; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
+; GCN: {{^BB[0-9]+}}_2:
+
+define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
%alloca = alloca [512 x i32], align 4
%out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
@@ -165,8 +208,8 @@ done:
; GCN: s_and_saveexec_b64
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
-; GCN: {{^}}BB5_2:
-define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
+; GCN: {{^BB[0-9]+}}_2:
+define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
%alloca = alloca [512 x i32], align 4
%out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
@@ -197,8 +240,8 @@ done:
; GCN: s_and_saveexec_b64
; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-; GCN: {{^}}BB6_2:
-define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
+; GCN: {{^BB[0-9]+}}_2:
+define amdgpu_kernel void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
entry:
%offset.ext = zext i32 %offset to i64
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
@@ -228,7 +271,7 @@ done:
; GCN: s_and_saveexec_b64
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 7
@@ -257,7 +300,7 @@ done:
; GCN: s_and_saveexec_b64
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 255
@@ -290,7 +333,7 @@ done:
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 256
@@ -322,7 +365,7 @@ done:
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967295
@@ -353,7 +396,7 @@ done:
; GCN: s_addc_u32
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 17179869181
@@ -383,7 +426,7 @@ done:
; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}
; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262143
@@ -421,7 +464,7 @@ done:
; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262144
@@ -445,13 +488,13 @@ done:
%struct.foo = type { [3 x float], [3 x float] }
; OPT-LABEL: @sink_ds_address(
-; OPT: ptrtoint %struct.foo addrspace(3)* %ptr to i64
+; OPT: getelementptr i8,
; GCN-LABEL: {{^}}sink_ds_address:
; GCN: s_load_dword [[SREG1:s[0-9]+]],
; GCN: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
; GCN-DAG: ds_read2_b32 v[{{[0-9+:[0-9]+}}], [[VREG1]] offset0:3 offset1:5
-define void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
+define amdgpu_kernel void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
entry:
%x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
%y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
@@ -476,9 +519,8 @@ bb34:
; OPT-LABEL: @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(
; OPT: br i1 %tmp0,
; OPT: if:
-; OPT: %sunkaddr = ptrtoint i8 addrspace(2)* %in to i64
-; OPT: %sunkaddr1 = add i64 %sunkaddr, 4095
-define void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
+; OPT: getelementptr i8, {{.*}} 4095
+define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
%in.gep = getelementptr i8, i8 addrspace(2)* %in, i64 4095
@@ -500,7 +542,141 @@ done:
ret void
}
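+; The constant offset (7 i32s = 28 bytes) is sunk into the if block as an i8
+; GEP on the base pointer, so the address computation only runs when the
+; atomic executes.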
+; OPT-LABEL: @test_sink_local_small_offset_atomicrmw_i32(
+; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
+; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
+; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
+; OPT: %tmp1 = atomicrmw add i32 addrspace(3)* %1, i32 2 seq_cst
+define amdgpu_kernel void @test_sink_local_small_offset_atomicrmw_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+entry:
+ %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
+ %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
+ br i1 %tmp0, label %endif, label %if
+
+if:
+ %tmp1 = atomicrmw add i32 addrspace(3)* %in.gep, i32 2 seq_cst
+ br label %endif
+
+endif:
+ %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+ store i32 %x, i32 addrspace(3)* %out.gep
+ br label %done
+
+done:
+ ret void
+}
+
+; OPT-LABEL: @test_sink_local_small_offset_cmpxchg_i32(
+; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
+; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
+; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
+; OPT: %tmp1.struct = cmpxchg i32 addrspace(3)* %1, i32 undef, i32 2 seq_cst monotonic
+define amdgpu_kernel void @test_sink_local_small_offset_cmpxchg_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+entry:
+ %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
+ %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
+ br i1 %tmp0, label %endif, label %if
+
+if:
+ %tmp1.struct = cmpxchg i32 addrspace(3)* %in.gep, i32 undef, i32 2 seq_cst monotonic
+ %tmp1 = extractvalue { i32, i1 } %tmp1.struct, 0
+ br label %endif
+
+endif:
+ %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+ store i32 %x, i32 addrspace(3)* %out.gep
+ br label %done
+
+done:
+ ret void
+}
+
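+; Here %in.gep is the value operand of the cmpxchg, not the pointer operand,
+; so there is no address to sink and the GEP stays in the entry block.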
+; OPT-LABEL: @test_wrong_operand_local_small_offset_cmpxchg_i32(
+; OPT: %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+; OPT: br i1
+; OPT: cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic
+define amdgpu_kernel void @test_wrong_operand_local_small_offset_cmpxchg_i32(i32 addrspace(3)* addrspace(3)* %out, i32 addrspace(3)* %in) {
+entry:
+ %out.gep = getelementptr i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* %out, i32 999999
+ %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
+ br i1 %tmp0, label %endif, label %if
+
+if:
+ %tmp1.struct = cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic
+ %tmp1 = extractvalue { i32 addrspace(3)*, i1 } %tmp1.struct, 0
+ br label %endif
+
+endif:
+ %x = phi i32 addrspace(3)* [ %tmp1, %if ], [ null, %entry ]
+ store i32 addrspace(3)* %x, i32 addrspace(3)* addrspace(3)* %out.gep
+ br label %done
+
+done:
+ ret void
+}
+
+; OPT-LABEL: @test_sink_local_small_offset_atomic_inc_i32(
+; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
+; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
+; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
+; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %1, i32 2, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @test_sink_local_small_offset_atomic_inc_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+entry:
+ %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
+ %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
+ br i1 %tmp0, label %endif, label %if
+
+if:
+ %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %in.gep, i32 2, i32 0, i32 0, i1 false)
+ br label %endif
+
+endif:
+ %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+ store i32 %x, i32 addrspace(3)* %out.gep
+ br label %done
+
+done:
+ ret void
+}
+
+; OPT-LABEL: @test_sink_local_small_offset_atomic_dec_i32(
+; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
+; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
+; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
+; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %1, i32 2, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @test_sink_local_small_offset_atomic_dec_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+entry:
+ %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
+ %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
+ br i1 %tmp0, label %endif, label %if
+
+if:
+ %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %in.gep, i32 2, i32 0, i32 0, i1 false)
+ br label %endif
+
+endif:
+ %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+ store i32 %x, i32 addrspace(3)* %out.gep
+ br label %done
+
+done:
+ ret void
+}
+
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
+declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
+attributes #2 = { nounwind argmemonly }
diff --git a/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
index 066ef951cc31..53adf09026ec 100644
--- a/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
+++ b/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
@@ -36,7 +36,7 @@
; GCN: BB0_3:
; GCN: buffer_store_dword
; GCN: s_endpgm
-define void @sink_ubfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
entry:
%shr = lshr i32 %arg1, 8
br i1 undef, label %bb0, label %bb1
@@ -76,7 +76,7 @@ ret:
; OPT: ret
; GCN-LABEL: {{^}}sink_sbfe_i32:
-define void @sink_sbfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
+define amdgpu_kernel void @sink_sbfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
entry:
%shr = ashr i32 %arg1, 8
br i1 undef, label %bb0, label %bb1
@@ -120,20 +120,21 @@ ret:
; GCN-LABEL: {{^}}sink_ubfe_i16:
; GCN-NOT: lshr
-; VI: s_bfe_u32 s0, s0, 0xc0004
+; VI: s_load_dword [[ARG:s[0-9]+]], s[0:1], 0x2c
+; VI: s_bfe_u32 [[BFE:s[0-9]+]], [[ARG]], 0xc0004
; GCN: s_cbranch_scc1
; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004
-; VI: s_and_b32 s0, s0, 0xff
+; VI: s_and_b32 s{{[0-9]+}}, [[BFE]], 0xff
; GCN: BB2_2:
; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004
-; VI: s_and_b32 s0, s0, 0x7f
+; VI: s_and_b32 s{{[0-9]+}}, [[BFE]], 0x7f
; GCN: BB2_3:
; GCN: buffer_store_short
; GCN: s_endpgm
-define void @sink_ubfe_i16(i16 addrspace(1)* %out, i16 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i16(i16 addrspace(1)* %out, i16 %arg1) #0 {
entry:
%shr = lshr i16 %arg1, 4
br i1 undef, label %bb0, label %bb1
@@ -186,7 +187,7 @@ ret:
; GCN: BB3_3:
; GCN: buffer_store_dwordx2
-define void @sink_ubfe_i64_span_midpoint(i64 addrspace(1)* %out, i64 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i64_span_midpoint(i64 addrspace(1)* %out, i64 %arg1) #0 {
entry:
%shr = lshr i64 %arg1, 30
br i1 undef, label %bb0, label %bb1
@@ -235,7 +236,7 @@ ret:
; GCN: BB4_3:
; GCN: buffer_store_dwordx2
-define void @sink_ubfe_i64_low32(i64 addrspace(1)* %out, i64 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i64_low32(i64 addrspace(1)* %out, i64 %arg1) #0 {
entry:
%shr = lshr i64 %arg1, 15
br i1 undef, label %bb0, label %bb1
@@ -282,7 +283,7 @@ ret:
; GCN: BB5_3:
; GCN: buffer_store_dwordx2
-define void @sink_ubfe_i64_high32(i64 addrspace(1)* %out, i64 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i64_high32(i64 addrspace(1)* %out, i64 %arg1) #0 {
entry:
%shr = lshr i64 %arg1, 35
br i1 undef, label %bb0, label %bb1
diff --git a/test/CodeGen/AMDGPU/clamp-modifier.ll b/test/CodeGen/AMDGPU/clamp-modifier.ll
new file mode 100644
index 000000000000..c3a7d5e14d87
--- /dev/null
+++ b/test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -0,0 +1,222 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}v_clamp_add_src_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-NOT: [[A]]
+; GCN: v_add_f32_e64 v{{[0-9]+}}, [[A]], 1.0 clamp{{$}}
+define amdgpu_kernel void @v_clamp_add_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %add = fadd float %a, 1.0
+ %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+ %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+ store float %clamp, float addrspace(1)* %out.gep
+ ret void
+}
+
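+; %add has a second (volatile store) use, so the clamp can't be folded into
+; the add; a separately clamped max of the add result is emitted instead.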
+; GCN-LABEL: {{^}}v_clamp_multi_use_src_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[ADD]], [[ADD]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %add = fadd float %a, 1.0
+ %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+ %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+ store float %clamp, float addrspace(1)* %out.gep
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
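+; An llvm.dbg.value use of the add is not a real use, so it does not
+; block folding the clamp into the add.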
+; GCN-LABEL: {{^}}v_clamp_dbg_use_src_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-NOT: [[A]]
+; GCN: v_add_f32_e64 v{{[0-9]+}}, [[A]], 1.0 clamp{{$}}
+define amdgpu_kernel void @v_clamp_dbg_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %add = fadd float %a, 1.0
+ call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10
+ %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+ %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+ store float %clamp, float addrspace(1)* %out.gep
+ ret void
+}
+
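+; The fneg of the floor result folds as a source modifier on the
+; clamping v_max, so no separate negate instruction is needed.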
+; GCN-LABEL: {{^}}v_clamp_add_neg_src_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_floor_f32_e32 [[FLOOR:v[0-9]+]], [[A]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[FLOOR]], -[[FLOOR]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_add_neg_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %floor = call float @llvm.floor.f32(float %a)
+ %neg.floor = fsub float -0.0, %floor
+ %max = call float @llvm.maxnum.f32(float %neg.floor, float 0.0)
+ %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+ store float %clamp, float addrspace(1)* %out.gep
+ ret void
+}
+
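+; A max with 0.0 that is not followed by a min with 1.0 is not a clamp,
+; so no clamp modifier may be set.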
+; GCN-LABEL: {{^}}v_non_clamp_max_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
+; GCN: v_max_f32_e32 v{{[0-9]+}}, 0, [[ADD]]{{$}}
+define amdgpu_kernel void @v_non_clamp_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %add = fadd float %a, 1.0
+ %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+ store float %max, float addrspace(1)* %out.gep
+ ret void
+}
+
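+; Unlike the output modifier, the clamp modifier is understood not to
+; flush denormals, so the fold remains legal with fp32 denormals
+; enabled.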
+; GCN-LABEL: {{^}}v_clamp_add_src_f32_denormals:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], [[A]], 1.0 clamp{{$}}
+define amdgpu_kernel void @v_clamp_add_src_f32_denormals(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %add = fadd float %a, 1.0
+ %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+ %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+ store float %clamp, float addrspace(1)* %out.gep
+ ret void
+}
+
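+; SI has no native f16 arithmetic; the add is promoted to f32 and the
+; clamp folds into the f32 instructions instead.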
+; GCN-LABEL: {{^}}v_clamp_add_src_f16_denorm:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; VI: v_add_f16_e64 [[ADD:v[0-9]+]], [[A]], 1.0 clamp{{$}}
+
+; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
+; SI: v_add_f32_e64 [[ADD:v[0-9]+]], [[CVT]], 1.0 clamp{{$}}
+; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[ADD]]
+define amdgpu_kernel void @v_clamp_add_src_f16_denorm(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
+ %a = load half, half addrspace(1)* %gep0
+ %add = fadd half %a, 1.0
+ %max = call half @llvm.maxnum.f16(half %add, half 0.0)
+ %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
+ store half %clamp, half addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_add_src_f16_no_denormals:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; VI-NOT: [[A]]
+; VI: v_add_f16_e64 v{{[0-9]+}}, [[A]], 1.0 clamp{{$}}
+
+; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
+; SI: v_add_f32_e64 [[ADD:v[0-9]+]], [[CVT]], 1.0 clamp{{$}}
+; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[ADD]]
+define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(half addrspace(1)* %out, half addrspace(1)* %aptr) #3 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
+ %a = load half, half addrspace(1)* %gep0
+ %add = fadd half %a, 1.0
+ %max = call half @llvm.maxnum.f16(half %add, half 0.0)
+ %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
+ store half %clamp, half addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_add_src_v2f32:
+; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}
+; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, v[[A]], 1.0 clamp{{$}}
+; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, v[[B]], 1.0 clamp{{$}}
+define amdgpu_kernel void @v_clamp_add_src_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %out, i32 %tid
+ %a = load <2 x float>, <2 x float> addrspace(1)* %gep0
+ %add = fadd <2 x float> %a, <float 1.0, float 1.0>
+ %max = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %add, <2 x float> zeroinitializer)
+ %clamp = call <2 x float> @llvm.minnum.v2f32(<2 x float> %max, <2 x float> <float 1.0, float 1.0>)
+ store <2 x float> %clamp, <2 x float> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_add_src_f64:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], 1.0 clamp{{$}}
+define amdgpu_kernel void @v_clamp_add_src_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
+ %a = load double, double addrspace(1)* %gep0
+ %add = fadd double %a, 1.0
+ %max = call double @llvm.maxnum.f64(double %add, double 0.0)
+ %clamp = call double @llvm.minnum.f64(double %max, double 1.0)
+ store double %clamp, double addrspace(1)* %out.gep
+ ret void
+}
+
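+; v_mac_f32 cannot encode a clamp modifier, so applying the clamp
+; requires converting the multiply-accumulate to v_mad_f32.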
+; GCN-LABEL: {{^}}v_clamp_mac_to_mad:
+; GCN: v_mad_f32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+define amdgpu_kernel void @v_clamp_mac_to_mad(float addrspace(1)* %out, float addrspace(1)* %aptr, float %a) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %b = load float, float addrspace(1)* %gep0
+
+ %mul = fmul float %a, %a
+ %add = fadd float %mul, %b
+ %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+ %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+ %res = fadd float %clamp, %b
+ store float %res, float addrspace(1)* %out.gep
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.floor.f32(float) #1
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
+declare double @llvm.fabs.f64(double) #1
+declare double @llvm.minnum.f64(double, double) #1
+declare double @llvm.maxnum.f64(double, double) #1
+declare half @llvm.fabs.f16(half) #1
+declare half @llvm.minnum.f16(half, half) #1
+declare half @llvm.maxnum.f16(half, half) #1
+declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #1
+declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #1
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "target-features"="+fp32-denormals" }
+attributes #3 = { nounwind "target-features"="-fp64-fp16-denormals" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
+!1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1)
+!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!6 = !DISubroutineType(types: !7)
+!7 = !{null, !8}
+!8 = !DIBasicType(name: "float", size: 32, align: 32)
+!9 = !DIExpression()
+!10 = !DILocation(line: 1, column: 42, scope: !5)
diff --git a/test/CodeGen/AMDGPU/clamp-omod-special-case.mir b/test/CodeGen/AMDGPU/clamp-omod-special-case.mir
new file mode 100644
index 000000000000..fbfd0fbf9308
--- /dev/null
+++ b/test/CodeGen/AMDGPU/clamp-omod-special-case.mir
@@ -0,0 +1,424 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fold-operands %s -o - | FileCheck -check-prefix=GCN %s
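+#
+# These MIR tests check that the si-fold-operands pass does not fold a
+# clamp or an output modifier into an instruction whose clamp/omod
+# fields are already in use. As a sketch, when the fold is legal,
+#   %21 = V_MAX_F32_e64 0, %20, 0, %20, 1, 0, ...   (clamp set)
+# folds its clamp bit back into the V_ADD_F32_e64 that defines %20.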
+--- |
+ define amdgpu_ps void @v_max_self_clamp_not_set_f32() #0 {
+ ret void
+ }
+
+ define amdgpu_ps void @v_clamp_omod_already_set_f32() #0 {
+ ret void
+ }
+
+ define amdgpu_ps void @v_omod_mul_omod_already_set_f32() #0 {
+ ret void
+ }
+
+ define amdgpu_ps void @v_omod_mul_clamp_already_set_f32() #0 {
+ ret void
+ }
+
+ define amdgpu_ps void @v_omod_add_omod_already_set_f32() #0 {
+ ret void
+ }
+
+ define amdgpu_ps void @v_omod_add_clamp_already_set_f32() #0 {
+ ret void
+ }
+
+ attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" }
+
+...
+---
+# GCN-LABEL: name: v_max_self_clamp_not_set_f32
+# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+# GCN-NEXT: %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 0, 0, implicit %exec
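+#
+# In these e64 operand lists, 1065353216 is the IEEE-754 bit pattern of
+# 1.0f (1056964608 is 0.5f), and the last two immediates are the clamp
+# bit and the omod field (0 = none, 1 = x2, 2 = x4, 3 = x0.5).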
+
+name: v_max_self_clamp_not_set_f32
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: sreg_32_xm0 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sgpr_64 }
+ - { id: 13, class: sgpr_128 }
+ - { id: 14, class: sreg_32_xm0 }
+ - { id: 15, class: sreg_64 }
+ - { id: 16, class: sgpr_128 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vreg_64 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vgpr_32 }
+ - { id: 21, class: vgpr_32 }
+ - { id: 22, class: vgpr_32 }
+ - { id: 23, class: vreg_64 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: vreg_64 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %24 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %25 = REG_SEQUENCE %3, 1, %24, 2
+ %10 = S_MOV_B32 61440
+ %11 = S_MOV_B32 0
+ %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+ %13 = REG_SEQUENCE killed %5, 17, %12, 18
+ %14 = S_MOV_B32 2
+ %26 = V_LSHL_B64 killed %25, 2, implicit %exec
+ %16 = REG_SEQUENCE killed %4, 17, %12, 18
+ %18 = COPY %26
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec
+ %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+ %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 0, 0, implicit %exec
+ BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+
+...
+---
+# GCN-LABEL: name: v_clamp_omod_already_set_f32
+# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+# GCN: %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 1, 3, implicit %exec
+name: v_clamp_omod_already_set_f32
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: sreg_32_xm0 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sgpr_64 }
+ - { id: 13, class: sgpr_128 }
+ - { id: 14, class: sreg_32_xm0 }
+ - { id: 15, class: sreg_64 }
+ - { id: 16, class: sgpr_128 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vreg_64 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vgpr_32 }
+ - { id: 21, class: vgpr_32 }
+ - { id: 22, class: vgpr_32 }
+ - { id: 23, class: vreg_64 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: vreg_64 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %24 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %25 = REG_SEQUENCE %3, 1, %24, 2
+ %10 = S_MOV_B32 61440
+ %11 = S_MOV_B32 0
+ %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+ %13 = REG_SEQUENCE killed %5, 17, %12, 18
+ %14 = S_MOV_B32 2
+ %26 = V_LSHL_B64 killed %25, 2, implicit %exec
+ %16 = REG_SEQUENCE killed %4, 17, %12, 18
+ %18 = COPY %26
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec
+ %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+ %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 1, 3, implicit %exec
+ BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+...
+---
+# Don't fold a mul that looks like an omod if the mul itself already has omod set.
+
+# GCN-LABEL: name: v_omod_mul_omod_already_set_f32
+# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+# GCN-NEXT: %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 0, 3, implicit %exec
+name: v_omod_mul_omod_already_set_f32
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: sreg_32_xm0 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sgpr_64 }
+ - { id: 13, class: sgpr_128 }
+ - { id: 14, class: sreg_32_xm0 }
+ - { id: 15, class: sreg_64 }
+ - { id: 16, class: sgpr_128 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vreg_64 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vgpr_32 }
+ - { id: 21, class: vgpr_32 }
+ - { id: 22, class: vgpr_32 }
+ - { id: 23, class: vreg_64 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: vreg_64 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %24 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %25 = REG_SEQUENCE %3, 1, %24, 2
+ %10 = S_MOV_B32 61440
+ %11 = S_MOV_B32 0
+ %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+ %13 = REG_SEQUENCE killed %5, 17, %12, 18
+ %14 = S_MOV_B32 2
+ %26 = V_LSHL_B64 killed %25, 2, implicit %exec
+ %16 = REG_SEQUENCE killed %4, 17, %12, 18
+ %18 = COPY %26
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec
+ %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+ %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 0, 3, implicit %exec
+ BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+
+...
+---
+# Don't fold a mul that looks like an omod if the mul itself already has clamp set.
+# This might be OK, but it would require folding the clamp at the same time.
+# GCN-LABEL: name: v_omod_mul_clamp_already_set_f32
+# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+# GCN-NEXT: %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 1, 0, implicit %exec
+
+name: v_omod_mul_clamp_already_set_f32
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: sreg_32_xm0 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sgpr_64 }
+ - { id: 13, class: sgpr_128 }
+ - { id: 14, class: sreg_32_xm0 }
+ - { id: 15, class: sreg_64 }
+ - { id: 16, class: sgpr_128 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vreg_64 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vgpr_32 }
+ - { id: 21, class: vgpr_32 }
+ - { id: 22, class: vgpr_32 }
+ - { id: 23, class: vreg_64 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: vreg_64 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %24 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %25 = REG_SEQUENCE %3, 1, %24, 2
+ %10 = S_MOV_B32 61440
+ %11 = S_MOV_B32 0
+ %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+ %13 = REG_SEQUENCE killed %5, 17, %12, 18
+ %14 = S_MOV_B32 2
+ %26 = V_LSHL_B64 killed %25, 2, implicit %exec
+ %16 = REG_SEQUENCE killed %4, 17, %12, 18
+ %18 = COPY %26
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec
+ %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+ %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 1, 0, implicit %exec
+ BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+
+...
+---
+# Don't fold an add that looks like an omod (x + x = 2x) if the add itself already has omod set.
+
+# GCN-LABEL: name: v_omod_add_omod_already_set_f32
+# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+# GCN-NEXT: %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 0, 3, implicit %exec
+name: v_omod_add_omod_already_set_f32
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: sreg_32_xm0 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sgpr_64 }
+ - { id: 13, class: sgpr_128 }
+ - { id: 14, class: sreg_32_xm0 }
+ - { id: 15, class: sreg_64 }
+ - { id: 16, class: sgpr_128 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vreg_64 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vgpr_32 }
+ - { id: 21, class: vgpr_32 }
+ - { id: 22, class: vgpr_32 }
+ - { id: 23, class: vreg_64 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: vreg_64 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %24 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %25 = REG_SEQUENCE %3, 1, %24, 2
+ %10 = S_MOV_B32 61440
+ %11 = S_MOV_B32 0
+ %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+ %13 = REG_SEQUENCE killed %5, 17, %12, 18
+ %14 = S_MOV_B32 2
+ %26 = V_LSHL_B64 killed %25, 2, implicit %exec
+ %16 = REG_SEQUENCE killed %4, 17, %12, 18
+ %18 = COPY %26
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec
+ %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+ %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 0, 3, implicit %exec
+ BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+
+...
+---
+# Don't fold an add that looks like an omod (x + x = 2x) if the add itself already has clamp set.
+# This might be OK, but it would require folding the clamp at the same time.
+# GCN-LABEL: name: v_omod_add_clamp_already_set_f32
+# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+# GCN-NEXT: %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 1, 0, implicit %exec
+
+name: v_omod_add_clamp_already_set_f32
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: sreg_32_xm0 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sgpr_64 }
+ - { id: 13, class: sgpr_128 }
+ - { id: 14, class: sreg_32_xm0 }
+ - { id: 15, class: sreg_64 }
+ - { id: 16, class: sgpr_128 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vreg_64 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vgpr_32 }
+ - { id: 21, class: vgpr_32 }
+ - { id: 22, class: vgpr_32 }
+ - { id: 23, class: vreg_64 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: vreg_64 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %24 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %25 = REG_SEQUENCE %3, 1, %24, 2
+ %10 = S_MOV_B32 61440
+ %11 = S_MOV_B32 0
+ %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+ %13 = REG_SEQUENCE killed %5, 17, %12, 18
+ %14 = S_MOV_B32 2
+ %26 = V_LSHL_B64 killed %25, 2, implicit %exec
+ %16 = REG_SEQUENCE killed %4, 17, %12, 18
+ %18 = COPY %26
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec
+ %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+ %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 1, 0, implicit %exec
+ BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/clamp.ll b/test/CodeGen/AMDGPU/clamp.ll
new file mode 100644
index 000000000000..9735c7074be2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/clamp.ll
@@ -0,0 +1,529 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
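+; Check that min(max(x, 0.0), 1.0) is selected either to the clamp
+; modifier on a v_max x, x, or, where the clamp modifier cannot be
+; used, to v_med3_f32 or to a separate max/min pair.
+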
+; GCN-LABEL: {{^}}v_clamp_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %max = call float @llvm.maxnum.f32(float %a, float 0.0)
+ %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_neg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_neg_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %fneg.a = fsub float -0.0, %a
+ %max = call float @llvm.maxnum.f32(float %fneg.a, float 0.0)
+ %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_negabs_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}
+define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %fabs.a = call float @llvm.fabs.f32(float %a)
+ %fneg.fabs.a = fsub float -0.0, %fabs.a
+
+ %max = call float @llvm.maxnum.f32(float %fneg.fabs.a, float 0.0)
+ %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
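+; A lower bound of -0.0 does not match the clamp-modifier pattern, so
+; v_med3_f32 with the sign-bit constant (materialized by v_bfrev_b32 of
+; 1) is emitted instead.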
+; GCN-LABEL: {{^}}v_clamp_negzero_f32:
+; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-DAG: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[SIGNBIT]], 1.0
+define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %max = call float @llvm.maxnum.f32(float %a, float -0.0)
+ %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
+define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %max = call float @llvm.maxnum.f32(float %a, float 0.0)
+ %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+ store float %med, float addrspace(1)* %out.gep
+ store volatile float %max, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_f16:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; VI: v_max_f16_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+
+; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], [[A]] clamp{{$}}
+; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
+define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
+ %a = load half, half addrspace(1)* %gep0
+ %max = call half @llvm.maxnum.f16(half %a, half 0.0)
+ %med = call half @llvm.minnum.f16(half %max, half 1.0)
+
+ store half %med, half addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_neg_f16:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; VI: v_max_f16_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
+
+; FIXME: Better to fold neg into max
+; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] clamp{{$}}
+; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
+define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
+ %a = load half, half addrspace(1)* %gep0
+ %fneg.a = fsub half -0.0, %a
+ %max = call half @llvm.maxnum.f16(half %fneg.a, half 0.0)
+ %med = call half @llvm.minnum.f16(half %max, half 1.0)
+
+ store half %med, half addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_negabs_f16:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; VI: v_max_f16_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}
+
+; FIXME: Better to fold neg/abs into max
+
+; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[A]]| clamp{{$}}
+; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
+define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
+ %a = load half, half addrspace(1)* %gep0
+ %fabs.a = call half @llvm.fabs.f16(half %a)
+ %fneg.fabs.a = fsub half -0.0, %fabs.a
+
+ %max = call half @llvm.maxnum.f16(half %fneg.fabs.a, half 0.0)
+ %med = call half @llvm.minnum.f16(half %max, half 1.0)
+
+ store half %med, half addrspace(1)* %out.gep
+ ret void
+}
+
+; FIXME: Do f64 instructions support clamp?
+; GCN-LABEL: {{^}}v_clamp_f64:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
+ %a = load double, double addrspace(1)* %gep0
+ %max = call double @llvm.maxnum.f64(double %a, double 0.0)
+ %med = call double @llvm.minnum.f64(double %max, double 1.0)
+
+ store double %med, double addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_neg_f64:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -[[A]], -[[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
+ %a = load double, double addrspace(1)* %gep0
+ %fneg.a = fsub double -0.0, %a
+ %max = call double @llvm.maxnum.f64(double %fneg.a, double 0.0)
+ %med = call double @llvm.minnum.f64(double %max, double 1.0)
+
+ store double %med, double addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_negabs_f64:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -|[[A]]|, -|[[A]]| clamp{{$}}
+define amdgpu_kernel void @v_clamp_negabs_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
+ %a = load double, double addrspace(1)* %gep0
+ %fabs.a = call double @llvm.fabs.f64(double %a)
+ %fneg.fabs.a = fsub double -0.0, %fabs.a
+
+ %max = call double @llvm.maxnum.f64(double %fneg.fabs.a, double 0.0)
+ %med = call double @llvm.minnum.f64(double %max, double 1.0)
+
+ store double %med, double addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_aby_negzero_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_med3_f32
+define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %med = call float @llvm.amdgcn.fmed3.f32(float -0.0, float 1.0, float %a)
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_aby_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_aby_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_bay_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_bay_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_yab_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_yab_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_yba_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_yba_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_ayb_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_bya_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_bya_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constants_to_one_f32:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 1.0
+define amdgpu_kernel void @v_clamp_constants_to_one_f32(float addrspace(1)* %out) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 4.0)
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constants_to_zero_f32:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+define amdgpu_kernel void @v_clamp_constants_to_zero_f32(float addrspace(1)* %out) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float -4.0)
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constant_preserve_f32:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0.5
+define amdgpu_kernel void @v_clamp_constant_preserve_f32(float addrspace(1)* %out) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0.5)
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constant_preserve_denorm_f32:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fffff{{$}}
+define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(float addrspace(1)* %out) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 8388607 to float))
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+define amdgpu_kernel void @v_clamp_constant_qnan_f32(float addrspace(1)* %out) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constant_snan_f32:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+define amdgpu_kernel void @v_clamp_constant_snan_f32(float addrspace(1)* %out) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; ---------------------------------------------------------------------
+; Test the non-default behaviors of enabling snans and disabling dx10_clamp
+; ---------------------------------------------------------------------
+
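+; With dx10_clamp enabled, the clamp modifier maps NaN inputs to 0.0,
+; matching the maxnum/minnum semantics of the IR pattern. With it
+; disabled, the pattern is selected to v_med3_f32 instead, and if
+; signaling NaNs may also be present, to a separate max and min.
+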
+; GCN-LABEL: {{^}}v_clamp_f32_no_dx10_clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
+define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %max = call float @llvm.maxnum.f32(float %a, float 0.0)
+ %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_f32_snan_dx10clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #3 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %max = call float @llvm.maxnum.f32(float %a, float 0.0)
+ %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
+define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %max = call float @llvm.maxnum.f32(float %a, float 0.0)
+ %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
+define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %add = fadd nnan float %a, 1.0
+ %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+ %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_aby_f32_no_dx10_clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_bay_f32_no_dx10_clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_yab_f32_no_dx10_clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
+define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_yba_f32_no_dx10_clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 1.0, 0
+define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32_no_dx10_clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, 0, [[A]], 1.0
+define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_bya_f32_no_dx10_clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, 1.0, [[A]], 0
+define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32_no_dx10_clamp:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fc00000
+define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constant_snan_f32_no_dx10_clamp:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7f800001
+define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
+ store float %med, float addrspace(1)* %out.gep
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
+declare double @llvm.fabs.f64(double) #1
+declare double @llvm.minnum.f64(double, double) #1
+declare double @llvm.maxnum.f64(double, double) #1
+declare half @llvm.fabs.f16(half) #1
+declare half @llvm.minnum.f16(half, half) #1
+declare half @llvm.maxnum.f16(half, half) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "target-features"="-dx10-clamp,-fp-exceptions" "no-nans-fp-math"="false" }
+attributes #3 = { nounwind "target-features"="+dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }
+attributes #4 = { nounwind "target-features"="-dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }
diff --git a/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll b/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
index 9b4b61cf728a..208d97feb642 100644
--- a/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
+++ b/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
@@ -8,7 +8,7 @@ declare i1 @llvm.amdgcn.class.f32(float, i32)
; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
-define void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) {
+define amdgpu_kernel void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) {
bb0:
%tmp = icmp sgt i32 %arg1, 4
%c = icmp eq i32 %arg3, 0
@@ -35,7 +35,7 @@ bb2:
; GCN-NOT: vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
-define void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
+define amdgpu_kernel void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
bb0:
%tmp = icmp sgt i32 %arg1, 4
%undef = call i1 @llvm.amdgcn.class.f32(float undef, i32 undef)
diff --git a/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll b/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll
index 7ff133b86e72..ef1b3d25f883 100644
--- a/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll
+++ b/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll
@@ -1,5 +1,4 @@
-; RUN: llc -march=amdgcn < %s | FileCheck %s
-; REQUIRES: asserts
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
;
; This testcase used to cause the following crash:
;
@@ -18,14 +17,16 @@
;
; Test for a valid output:
; CHECK: image_sample_c_d_o
-
-target triple = "amdgcn--"
-
define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg, [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg1, [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg2, [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg3, [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg4, float inreg %arg5, i32 inreg %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <3 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, <2 x i32> %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, i32 %arg20, float %arg21, i32 %arg22) #0 {
main_body:
- %tmp = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %arg6, <2 x i32> %arg8)
- %tmp23 = fadd float %tmp, 0xBFA99999A0000000
- %tmp24 = fadd float %tmp, 0x3FA99999A0000000
+ %i.i = extractelement <2 x i32> %arg8, i32 0
+ %j.i = extractelement <2 x i32> %arg8, i32 1
+ %i.f.i = bitcast i32 %i.i to float
+ %j.f.i = bitcast i32 %j.i to float
+ %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 3, i32 0, i32 %arg6) #1
+ %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 3, i32 0, i32 %arg6) #1
+ %tmp23 = fadd float %p2.i, 0xBFA99999A0000000
+ %tmp24 = fadd float %p2.i, 0x3FA99999A0000000
%tmp25 = bitcast float %tmp23 to i32
%tmp26 = insertelement <16 x i32> <i32 212739, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, i32 %tmp25, i32 1
%tmp27 = insertelement <16 x i32> %tmp26, i32 undef, i32 2
@@ -35,7 +36,8 @@ main_body:
%tmp31 = insertelement <16 x i32> %tmp30, i32 undef, i32 6
%tmp32 = insertelement <16 x i32> %tmp31, i32 undef, i32 7
%tmp33 = insertelement <16 x i32> %tmp32, i32 undef, i32 8
- %tmp34 = call <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32> %tmp33, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %tmp33.bc = bitcast <16 x i32> %tmp33 to <16 x float>
+ %tmp34 = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v16f32.v8i32(<16 x float> %tmp33.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 true)
%tmp35 = extractelement <4 x float> %tmp34, i32 0
%tmp36 = bitcast float %tmp24 to i32
%tmp37 = insertelement <16 x i32> <i32 212739, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, i32 %tmp36, i32 1
@@ -46,7 +48,8 @@ main_body:
%tmp42 = insertelement <16 x i32> %tmp41, i32 undef, i32 6
%tmp43 = insertelement <16 x i32> %tmp42, i32 undef, i32 7
%tmp44 = insertelement <16 x i32> %tmp43, i32 undef, i32 8
- %tmp45 = call <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32> %tmp44, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %tmp44.bc = bitcast <16 x i32> %tmp44 to <16 x float>
+ %tmp45 = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v16f32.v8i32(<16 x float> %tmp44.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 true)
%tmp46 = extractelement <4 x float> %tmp45, i32 0
%tmp47 = fmul float %tmp35, %tmp46
%tmp48 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, float %tmp47, 14
@@ -54,9 +57,10 @@ main_body:
ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %tmp49
}
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
-declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v16f32.v8i32(<16 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
-attributes #0 = { "InitialPSInputAddr"="36983" "target-cpu"="tonga" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/coalescer_remat.ll b/test/CodeGen/AMDGPU/coalescer_remat.ll
index 4c7875c3a039..3e1b76a1df09 100644
--- a/test/CodeGen/AMDGPU/coalescer_remat.ll
+++ b/test/CodeGen/AMDGPU/coalescer_remat.ll
@@ -13,7 +13,7 @@ declare float @llvm.fma.f32(float, float, float)
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
; It's probably OK if this is slightly higher:
; CHECK: ; NumVgprs: 8
-define void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
+define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
entry:
%cmpflag = icmp eq i32 %flag, 1
br i1 %cmpflag, label %loop, label %exit
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-deduce-ro-arg.ll b/test/CodeGen/AMDGPU/code-object-metadata-deduce-ro-arg.ll
new file mode 100644
index 000000000000..a33c3646e253
--- /dev/null
+++ b/test/CodeGen/AMDGPU/code-object-metadata-deduce-ro-arg.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck %s
+
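+; The readonly/noalias attributes on %in are deduced into the metadata:
+; AccQual ReadOnly, IsConst, and IsRestrict for the first argument,
+; while %out keeps the Default access qualifier.
+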
+; CHECK: - Name: test_ro_arg
+; CHECK: Args:
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: GlobalBuffer
+; CHECK-NEXT: ValueType: F32
+; CHECK-NEXT: AccQual: ReadOnly
+; CHECK-NEXT: AddrSpaceQual: Global
+; CHECK-NEXT: IsConst: true
+; CHECK-NEXT: IsRestrict: true
+; CHECK-NEXT: TypeName: 'float*'
+
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: GlobalBuffer
+; CHECK-NEXT: ValueType: F32
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: AddrSpaceQual: Global
+; CHECK-NEXT: TypeName: 'float*'
+
+define amdgpu_kernel void @test_ro_arg(float addrspace(1)* noalias readonly %in, float addrspace(1)* %out)
+ !kernel_arg_addr_space !0 !kernel_arg_access_qual !1 !kernel_arg_type !2
+ !kernel_arg_base_type !2 !kernel_arg_type_qual !3 {
+ ret void
+}
+
+!0 = !{i32 1, i32 1}
+!1 = !{!"none", !"none"}
+!2 = !{!"float*", !"float*"}
+!3 = !{!"const restrict", !""}
+
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll
new file mode 100644
index 000000000000..88ba310a92ca
--- /dev/null
+++ b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll
@@ -0,0 +1,1260 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX800 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -amdgpu-dump-comd -amdgpu-verify-comd -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -amdgpu-dump-comd -amdgpu-verify-comd -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-comd -amdgpu-verify-comd -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+
+%struct.A = type { i8, float }
+%opencl.image1d_t = type opaque
+%opencl.image2d_t = type opaque
+%opencl.image3d_t = type opaque
+%opencl.queue_t = type opaque
+%opencl.pipe_t = type opaque
+%struct.B = type { i32 addrspace(1)*}
+%opencl.clk_event_t = type opaque
+
+; CHECK: ---
+; CHECK: Version: [ 1, 0 ]
+; CHECK: Printf: [ '1:1:4:%d\n', '2:1:8:%g\n' ]
+; CHECK: Kernels:
+
+; CHECK: - Name: test_char
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 1
+; CHECK-NEXT: Align: 1
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: char
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_char(i8 %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9
+ !kernel_arg_base_type !9 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK: - Name: test_ushort2
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: U16
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: ushort2
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_ushort2(<2 x i16> %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !10
+ !kernel_arg_base_type !10 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK: - Name: test_int3
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 16
+; CHECK-NEXT: Align: 16
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: I32
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: int3
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_int3(<3 x i32> %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !11
+ !kernel_arg_base_type !11 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK: - Name: test_ulong4
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 32
+; CHECK-NEXT: Align: 32
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: U64
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: ulong4
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_ulong4(<4 x i64> %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !12
+ !kernel_arg_base_type !12 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK: - Name: test_half8
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 16
+; CHECK-NEXT: Align: 16
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: F16
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: half8
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_half8(<8 x half> %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !13
+ !kernel_arg_base_type !13 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK: - Name: test_float16
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 64
+; CHECK-NEXT: Align: 64
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: F32
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: float16
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_float16(<16 x float> %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !14
+ !kernel_arg_base_type !14 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK: - Name: test_double16
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 128
+; CHECK-NEXT: Align: 128
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: F64
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: double16
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_double16(<16 x double> %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !15
+ !kernel_arg_base_type !15 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK: - Name: test_pointer
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: GlobalBuffer
+; CHECK-NEXT: ValueType: I32
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: AddrSpaceQual: Global
+; CHECK-NEXT: TypeName: 'int *'
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_pointer(i32 addrspace(1)* %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !16
+ !kernel_arg_base_type !16 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK: - Name: test_image
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: Image
+; CHECK-NEXT: ValueType: Struct
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: AddrSpaceQual: Global
+; CHECK-NEXT: TypeName: image2d_t
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_image(%opencl.image2d_t addrspace(1)* %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !17
+ !kernel_arg_base_type !17 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK: - Name: test_sampler
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: Sampler
+; CHECK-NEXT: ValueType: I32
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: sampler_t
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_sampler(i32 %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !18
+ !kernel_arg_base_type !18 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK: - Name: test_queue
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: Queue
+; CHECK-NEXT: ValueType: Struct
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: AddrSpaceQual: Global
+; CHECK-NEXT: TypeName: queue_t
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_queue(%opencl.queue_t addrspace(1)* %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !19
+ !kernel_arg_base_type !19 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK: - Name: test_struct
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: GlobalBuffer
+; CHECK-NEXT: ValueType: Struct
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: AddrSpaceQual: Private
+; CHECK-NEXT: TypeName: struct A
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_struct(%struct.A* byval %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20
+ !kernel_arg_base_type !20 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK: - Name: test_i128
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 16
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: Struct
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: i128
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_i128(i128 %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !21
+ !kernel_arg_base_type !21 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK: - Name: test_multi_arg
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: I32
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: int
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: I16
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: short2
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: char3
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_multi_arg(i32 %a, <2 x i16> %b, <3 x i8> %c)
+ !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !24
+ !kernel_arg_base_type !24 !kernel_arg_type_qual !25 {
+ ret void
+}
+
+; CHECK: - Name: test_addr_space
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: GlobalBuffer
+; CHECK-NEXT: ValueType: I32
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: AddrSpaceQual: Global
+; CHECK-NEXT: TypeName: 'int *'
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: GlobalBuffer
+; CHECK-NEXT: ValueType: I32
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: AddrSpaceQual: Constant
+; CHECK-NEXT: TypeName: 'int *'
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: DynamicSharedPointer
+; CHECK-NEXT: ValueType: I32
+; CHECK-NEXT: PointeeAlign: 4
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: AddrSpaceQual: Local
+; CHECK-NEXT: TypeName: 'int *'
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_addr_space(i32 addrspace(1)* %g,
+ i32 addrspace(2)* %c,
+ i32 addrspace(3)* %l)
+ !kernel_arg_addr_space !50 !kernel_arg_access_qual !23 !kernel_arg_type !51
+ !kernel_arg_base_type !51 !kernel_arg_type_qual !25 {
+ ret void
+}
+
+; CHECK: - Name: test_type_qual
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: GlobalBuffer
+; CHECK-NEXT: ValueType: I32
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: AddrSpaceQual: Global
+; CHECK-NEXT: IsVolatile: true
+; CHECK-NEXT: TypeName: 'int *'
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: GlobalBuffer
+; CHECK-NEXT: ValueType: I32
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: AddrSpaceQual: Global
+; CHECK-NEXT: IsConst: true
+; CHECK-NEXT: IsRestrict: true
+; CHECK-NEXT: TypeName: 'int *'
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: Pipe
+; CHECK-NEXT: ValueType: Struct
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: AddrSpaceQual: Global
+; CHECK-NEXT: IsPipe: true
+; CHECK-NEXT: TypeName: 'int *'
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_type_qual(i32 addrspace(1)* %a,
+ i32 addrspace(1)* %b,
+ %opencl.pipe_t addrspace(1)* %c)
+ !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !51
+ !kernel_arg_base_type !51 !kernel_arg_type_qual !70 {
+ ret void
+}
+
+; CHECK: - Name: test_access_qual
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: Image
+; CHECK-NEXT: ValueType: Struct
+; CHECK-NEXT: AccQual: ReadOnly
+; CHECK-NEXT: AddrSpaceQual: Global
+; CHECK-NEXT: TypeName: image1d_t
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: Image
+; CHECK-NEXT: ValueType: Struct
+; CHECK-NEXT: AccQual: WriteOnly
+; CHECK-NEXT: AddrSpaceQual: Global
+; CHECK-NEXT: TypeName: image2d_t
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: Image
+; CHECK-NEXT: ValueType: Struct
+; CHECK-NEXT: AccQual: ReadWrite
+; CHECK-NEXT: AddrSpaceQual: Global
+; CHECK-NEXT: TypeName: image3d_t
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_access_qual(%opencl.image1d_t addrspace(1)* %ro,
+ %opencl.image2d_t addrspace(1)* %wo,
+ %opencl.image3d_t addrspace(1)* %rw)
+ !kernel_arg_addr_space !60 !kernel_arg_access_qual !61 !kernel_arg_type !62
+ !kernel_arg_base_type !62 !kernel_arg_type_qual !25 {
+ ret void
+}
+
+; CHECK: - Name: test_vec_type_hint_half
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Attrs:
+; CHECK-NEXT: VecTypeHint: half
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: I32
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: int
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_vec_type_hint_half(i32 %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+ !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !26 {
+ ret void
+}
+
+; CHECK: - Name: test_vec_type_hint_float
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Attrs:
+; CHECK-NEXT: VecTypeHint: float
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: I32
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: int
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_vec_type_hint_float(i32 %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+ !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !27 {
+ ret void
+}
+
+; CHECK: - Name: test_vec_type_hint_double
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Attrs:
+; CHECK-NEXT: VecTypeHint: double
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: I32
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: int
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_vec_type_hint_double(i32 %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+ !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !28 {
+ ret void
+}
+
+; CHECK: - Name: test_vec_type_hint_char
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Attrs:
+; CHECK-NEXT: VecTypeHint: char
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: I32
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: int
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_vec_type_hint_char(i32 %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+ !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !29 {
+ ret void
+}
+
+; CHECK: - Name: test_vec_type_hint_short
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Attrs:
+; CHECK-NEXT: VecTypeHint: short
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: I32
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: int
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_vec_type_hint_short(i32 %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+ !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !30 {
+ ret void
+}
+
+; CHECK: - Name: test_vec_type_hint_long
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Attrs:
+; CHECK-NEXT: VecTypeHint: long
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: I32
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: int
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_vec_type_hint_long(i32 %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+ !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !31 {
+ ret void
+}
+
+; CHECK: - Name: test_vec_type_hint_unknown
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Attrs:
+; CHECK-NEXT: VecTypeHint: unknown
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: I32
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: int
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_vec_type_hint_unknown(i32 %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+ !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !32 {
+ ret void
+}
+
+; CHECK: - Name: test_reqd_wgs_vec_type_hint
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Attrs:
+; CHECK-NEXT: ReqdWorkGroupSize: [ 1, 2, 4 ]
+; CHECK-NEXT: VecTypeHint: int
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: I32
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: int
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_reqd_wgs_vec_type_hint(i32 %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+ !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !5
+ !reqd_work_group_size !6 {
+ ret void
+}
+
+; CHECK: - Name: test_wgs_hint_vec_type_hint
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Attrs:
+; CHECK-NEXT: WorkGroupSizeHint: [ 8, 16, 32 ]
+; CHECK-NEXT: VecTypeHint: uint4
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: I32
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: int
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_wgs_hint_vec_type_hint(i32 %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+ !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !7
+ !work_group_size_hint !8 {
+ ret void
+}
+
+; CHECK: - Name: test_arg_ptr_to_ptr
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: GlobalBuffer
+; CHECK-NEXT: ValueType: I32
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: AddrSpaceQual: Global
+; CHECK-NEXT: TypeName: 'int **'
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_arg_ptr_to_ptr(i32* addrspace(1)* %a)
+ !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !80
+ !kernel_arg_base_type !80 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK: - Name: test_arg_struct_contains_ptr
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: GlobalBuffer
+; CHECK-NEXT: ValueType: Struct
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: AddrSpaceQual: Private
+; CHECK-NEXT: TypeName: struct B
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_arg_struct_contains_ptr(%struct.B* byval %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !82
+ !kernel_arg_base_type !82 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK: - Name: test_arg_vector_of_ptr
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 16
+; CHECK-NEXT: Align: 16
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: I32
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: TypeName: 'global int* __attribute__((ext_vector_type(2)))'
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_arg_vector_of_ptr(<2 x i32 addrspace(1)*> %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !83
+ !kernel_arg_base_type !83 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK: - Name: test_arg_unknown_builtin_type
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: GlobalBuffer
+; CHECK-NEXT: ValueType: Struct
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: AddrSpaceQual: Global
+; CHECK-NEXT: TypeName: clk_event_t
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_arg_unknown_builtin_type(
+ %opencl.clk_event_t addrspace(1)* %a)
+ !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !84
+ !kernel_arg_base_type !84 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK: - Name: test_pointee_align
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: GlobalBuffer
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: AddrSpaceQual: Global
+; CHECK-NEXT: TypeName: 'long *'
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: DynamicSharedPointer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: PointeeAlign: 1
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: AddrSpaceQual: Local
+; CHECK-NEXT: TypeName: 'char *'
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: DynamicSharedPointer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: PointeeAlign: 2
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: AddrSpaceQual: Local
+; CHECK-NEXT: TypeName: 'char2 *'
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: DynamicSharedPointer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: PointeeAlign: 4
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: AddrSpaceQual: Local
+; CHECK-NEXT: TypeName: 'char3 *'
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: DynamicSharedPointer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: PointeeAlign: 4
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: AddrSpaceQual: Local
+; CHECK-NEXT: TypeName: 'char4 *'
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: DynamicSharedPointer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: PointeeAlign: 8
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: AddrSpaceQual: Local
+; CHECK-NEXT: TypeName: 'char8 *'
+; CHECK-NEXT: - Size: 4
+; CHECK-NEXT: Align: 4
+; CHECK-NEXT: ValueKind: DynamicSharedPointer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: PointeeAlign: 16
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: AddrSpaceQual: Local
+; CHECK-NEXT: TypeName: 'char16 *'
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a,
+ i8 addrspace(3)* %b,
+ <2 x i8> addrspace(3)* %c,
+ <3 x i8> addrspace(3)* %d,
+ <4 x i8> addrspace(3)* %e,
+ <8 x i8> addrspace(3)* %f,
+ <16 x i8> addrspace(3)* %g)
+ !kernel_arg_addr_space !91 !kernel_arg_access_qual !92 !kernel_arg_type !93
+ !kernel_arg_base_type !93 !kernel_arg_type_qual !94 {
+ ret void
+}
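+; Note (editorial assumption): PointeeAlign tracks the natural alignment of
+; the local pointee type, so <3 x i8> is padded to 4 bytes and 'char3 *'
+; reports PointeeAlign: 4, the same as 'char4 *'.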
+
+!llvm.printf.fmts = !{!100, !101}
+
+!1 = !{i32 0}
+!2 = !{!"none"}
+!3 = !{!"int"}
+!4 = !{!""}
+!5 = !{i32 undef, i32 1}
+!6 = !{i32 1, i32 2, i32 4}
+!7 = !{<4 x i32> undef, i32 0}
+!8 = !{i32 8, i32 16, i32 32}
+!9 = !{!"char"}
+!10 = !{!"ushort2"}
+!11 = !{!"int3"}
+!12 = !{!"ulong4"}
+!13 = !{!"half8"}
+!14 = !{!"float16"}
+!15 = !{!"double16"}
+!16 = !{!"int *"}
+!17 = !{!"image2d_t"}
+!18 = !{!"sampler_t"}
+!19 = !{!"queue_t"}
+!20 = !{!"struct A"}
+!21 = !{!"i128"}
+!22 = !{i32 0, i32 0, i32 0}
+!23 = !{!"none", !"none", !"none"}
+!24 = !{!"int", !"short2", !"char3"}
+!25 = !{!"", !"", !""}
+!26 = !{half undef, i32 1}
+!27 = !{float undef, i32 1}
+!28 = !{double undef, i32 1}
+!29 = !{i8 undef, i32 1}
+!30 = !{i16 undef, i32 1}
+!31 = !{i64 undef, i32 1}
+!32 = !{i32* undef, i32 1}
+!50 = !{i32 1, i32 2, i32 3}
+!51 = !{!"int *", !"int *", !"int *"}
+!60 = !{i32 1, i32 1, i32 1}
+!61 = !{!"read_only", !"write_only", !"read_write"}
+!62 = !{!"image1d_t", !"image2d_t", !"image3d_t"}
+!70 = !{!"volatile", !"const restrict", !"pipe"}
+!80 = !{!"int **"}
+!81 = !{i32 1}
+!82 = !{!"struct B"}
+!83 = !{!"global int* __attribute__((ext_vector_type(2)))"}
+!84 = !{!"clk_event_t"}
+!opencl.ocl.version = !{!90}
+!90 = !{i32 2, i32 0}
+!91 = !{i32 0, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3}
+!92 = !{!"none", !"none", !"none", !"none", !"none", !"none", !"none"}
+!93 = !{!"long *", !"char *", !"char2 *", !"char3 *", !"char4 *", !"char8 *", !"char16 *"}
+!94 = !{!"", !"", !"", !"", !"", !"", !""}
+!100 = !{!"1:1:4:%d\5Cn"}
+!101 = !{!"2:1:8:%g\5Cn"}
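+; '\5C' is the LLVM IR hex escape for a backslash, so !100 encodes the literal
+; string 1:1:4:%d\n, which the Printf check near the top of this file matches
+; as '1:1:4:%d\n' (and likewise !101 for '2:1:8:%g\n').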
+
+; NOTES: Displaying notes found at file offset 0x{{[0-9]+}}
+; NOTES-NEXT: Owner Data size Description
+; NOTES-NEXT: AMD 0x00000008 Unknown note type: (0x00000001)
+; NOTES-NEXT: AMD 0x0000001b Unknown note type: (0x00000003)
+; GFX700: AMD 0x00009171 Unknown note type: (0x0000000a)
+; GFX800: AMD 0x00009190 Unknown note type: (0x0000000a)
+; GFX900: AMD 0x00009171 Unknown note type: (0x0000000a)
+
+; PARSER: AMDGPU Code Object Metadata Parser Test: PASS
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-1.ll b/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-1.ll
new file mode 100644
index 000000000000..f41da9f92136
--- /dev/null
+++ b/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-1.ll
@@ -0,0 +1,9 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata | FileCheck %s
+
+; Make sure llc does not crash for invalid OpenCL version metadata.
+
+; CHECK: ---
+; CHECK: Version: [ 1, 0 ]
+; CHECK: ...
+
+!opencl.ocl.version = !{}
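+; For contrast, a well-formed version tuple would look like this (sketch,
+; mirroring the valid form used in code-object-metadata-from-llvm-ir-full.ll):
+;   !opencl.ocl.version = !{!1}
+;   !1 = !{i32 2, i32 0}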
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-2.ll b/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-2.ll
new file mode 100644
index 000000000000..0509663d9849
--- /dev/null
+++ b/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-2.ll
@@ -0,0 +1,10 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata | FileCheck %s
+
+; Make sure llc does not crash for invalid OpenCL version metadata.
+
+; CHECK: ---
+; CHECK: Version: [ 1, 0 ]
+; CHECK: ...
+
+!opencl.ocl.version = !{!0}
+!0 = !{}
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-3.ll b/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-3.ll
new file mode 100644
index 000000000000..7404cec5d78a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-3.ll
@@ -0,0 +1,10 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata | FileCheck %s
+
+; Make sure llc does not crash for invalid OpenCL version metadata.
+
+; CHECK: ---
+; CHECK: Version: [ 1, 0 ]
+; CHECK: ...
+
+!opencl.ocl.version = !{!0}
+!0 = !{i32 1}
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-kernel-code-props.ll b/test/CodeGen/AMDGPU/code-object-metadata-kernel-code-props.ll
new file mode 100644
index 000000000000..3b232e40cf25
--- /dev/null
+++ b/test/CodeGen/AMDGPU/code-object-metadata-kernel-code-props.ll
@@ -0,0 +1,32 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX800 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+
+; CHECK: ---
+; CHECK: Version: [ 1, 0 ]
+
+; CHECK: Kernels:
+; CHECK: - Name: test
+; CHECK: CodeProps:
+; CHECK: KernargSegmentSize: 24
+; GFX700: WavefrontNumSGPRs: 6
+; GFX800: WavefrontNumSGPRs: 96
+; GFX900: WavefrontNumSGPRs: 6
+; GFX700: WorkitemNumVGPRs: 4
+; GFX800: WorkitemNumVGPRs: 6
+; GFX900: WorkitemNumVGPRs: 6
+; CHECK: KernargSegmentAlign: 4
+; CHECK: GroupSegmentAlign: 4
+; CHECK: PrivateSegmentAlign: 4
+; CHECK: WavefrontSize: 6
+define amdgpu_kernel void @test(
+ half addrspace(1)* %r,
+ half addrspace(1)* %a,
+ half addrspace(1)* %b) {
+entry:
+ %a.val = load half, half addrspace(1)* %a
+ %b.val = load half, half addrspace(1)* %b
+ %r.val = fadd half %a.val, %b.val
+ store half %r.val, half addrspace(1)* %r
+ ret void
+}
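+; Note (editorial): KernargSegmentSize: 24 matches three 8-byte global pointer
+; arguments (3 x 8 = 24). The align/size values of 4 and 6 appear to be
+; log2-encoded (16-byte alignment, 64-lane wavefront) -- an assumption based
+; on the checked values, not something this test states.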
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll b/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll
new file mode 100644
index 000000000000..801029be8cb9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll
@@ -0,0 +1,67 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX800 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+; CHECK: ---
+; CHECK: Version: [ 1, 0 ]
+
+; CHECK: Kernels:
+; CHECK: - Name: test
+; CHECK: DebugProps:
+; CHECK: DebuggerABIVersion: [ 1, 0 ]
+; CHECK: ReservedNumVGPRs: 4
+; CHECK: ReservedFirstVGPR: 11
+; CHECK: PrivateSegmentBufferSGPR: 0
+; CHECK: WavefrontPrivateSegmentOffsetSGPR: 11
+define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !7 !kernel_arg_addr_space !12 !kernel_arg_access_qual !13 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !15 {
+entry:
+ %A.addr = alloca i32 addrspace(1)*, align 4
+ store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
+ call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !16, metadata !17), !dbg !18
+ %0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !19
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 0, !dbg !19
+ store i32 777, i32 addrspace(1)* %arrayidx, align 4, !dbg !20
+ %1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !21
+ %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %1, i64 1, !dbg !21
+ store i32 888, i32 addrspace(1)* %arrayidx1, align 4, !dbg !22
+ %2 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !23
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %2, i64 2, !dbg !23
+ store i32 999, i32 addrspace(1)* %arrayidx2, align 4, !dbg !24
+ ret void, !dbg !25
+}
+
+attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="gfx800" "target-features"="+16-bit-insts,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops,+amdgpu-debugger-reserve-regs,+dpp,+fp64-fp16-denormals,+s-memrealtime,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!opencl.ocl.version = !{!3}
+!llvm.module.flags = !{!4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "code-object-metadata-kernel-debug-props.cl", directory: "/some/random/directory")
+!2 = !{}
+!3 = !{i32 1, i32 0}
+!4 = !{i32 2, !"Dwarf Version", i32 2}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = !{!"clang version 5.0.0"}
+!7 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !10}
+!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64)
+!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!12 = !{i32 1}
+!13 = !{!"none"}
+!14 = !{!"int*"}
+!15 = !{!""}
+!16 = !DILocalVariable(name: "A", arg: 1, scope: !7, file: !1, line: 1, type: !10)
+!17 = !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef)
+!18 = !DILocation(line: 1, column: 30, scope: !7)
+!19 = !DILocation(line: 2, column: 3, scope: !7)
+!20 = !DILocation(line: 2, column: 8, scope: !7)
+!21 = !DILocation(line: 3, column: 3, scope: !7)
+!22 = !DILocation(line: 3, column: 8, scope: !7)
+!23 = !DILocation(line: 4, column: 3, scope: !7)
+!24 = !DILocation(line: 4, column: 8, scope: !7)
+!25 = !DILocation(line: 5, column: 1, scope: !7)
diff --git a/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll b/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll
index 585172092676..155de5353bcb 100644
--- a/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll
+++ b/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll
@@ -8,7 +8,7 @@
; SI-LLC-LABEL: {{^}}test:
; SI-LLC: s_mul_i32
; SI-LLC-NOT: mul
-define void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) {
+define amdgpu_kernel void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) {
entry:
%0 = mul nsw i32 %a, 3
%1 = sext i32 %0 to i64
diff --git a/test/CodeGen/AMDGPU/combine_vloads.ll b/test/CodeGen/AMDGPU/combine_vloads.ll
index 01572afa6205..f8d4e01085c2 100644
--- a/test/CodeGen/AMDGPU/combine_vloads.ll
+++ b/test/CodeGen/AMDGPU/combine_vloads.ll
@@ -12,7 +12,7 @@
; EG-LABEL: {{^}}combine_vloads:
; EG: VTX_READ_128
; EG: VTX_READ_128
-define void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind {
+define amdgpu_kernel void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind {
entry:
br label %for.body
diff --git a/test/CodeGen/AMDGPU/commute-compares.ll b/test/CodeGen/AMDGPU/commute-compares.ll
index a4c51b233f41..973c4544d97a 100644
--- a/test/CodeGen/AMDGPU/commute-compares.ll
+++ b/test/CodeGen/AMDGPU/commute-compares.ll
@@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
; GCN-LABEL: {{^}}commute_eq_64_i32:
; GCN: v_cmp_eq_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -21,7 +21,7 @@ define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_ne_64_i32:
; GCN: v_cmp_ne_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -36,7 +36,7 @@ define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_ne_litk_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039
; GCN: v_cmp_ne_u32_e32 vcc, [[K]], v{{[0-9]+}}
-define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -49,7 +49,7 @@ define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; GCN-LABEL: {{^}}commute_ugt_64_i32:
; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -62,7 +62,7 @@ define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_uge_64_i32:
; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}}
-define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
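A brief reading note on the checks above (editorial, not part of the patch): the commuted immediates encode equivalent predicates. For example, `x uge 64` is checked as `v_cmp_lt_u32_e32 vcc, 63, v...` because x >= 64 is the same comparison as 63 < x, which lets the constant occupy the commuted first operand slot; the `ule 63` case below becomes `64 > x` the same way.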
@@ -75,7 +75,7 @@ define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_ult_64_i32:
; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -88,7 +88,7 @@ define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_ule_63_i32:
; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -104,7 +104,7 @@ define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_ule_64_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}}
; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}}
-define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -117,7 +117,7 @@ define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_sgt_neg1_i32:
; GCN: v_cmp_lt_i32_e32 vcc, -1, v{{[0-9]+}}
-define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -130,7 +130,7 @@ define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; GCN-LABEL: {{^}}commute_sge_neg2_i32:
; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}}
-define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -143,7 +143,7 @@ define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; GCN-LABEL: {{^}}commute_slt_neg16_i32:
; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}}
-define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -156,7 +156,7 @@ define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in
; GCN-LABEL: {{^}}commute_sle_5_i32:
; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}}
-define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -173,7 +173,7 @@ define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_eq_64_i64:
; GCN: v_cmp_eq_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -186,7 +186,7 @@ define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_ne_64_i64:
; GCN: v_cmp_ne_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -199,7 +199,7 @@ define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_ugt_64_i64:
; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -212,7 +212,7 @@ define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_uge_64_i64:
; GCN: v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -225,7 +225,7 @@ define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_ult_64_i64:
; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -238,7 +238,7 @@ define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_ule_63_i64:
; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -254,7 +254,7 @@ define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_ule_64_i64:
; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}}
; GCN: v_cmp_gt_u64_e32 vcc, s{{\[}}[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -267,7 +267,7 @@ define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_sgt_neg1_i64:
; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -280,7 +280,7 @@ define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in)
; GCN-LABEL: {{^}}commute_sge_neg2_i64:
; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -293,7 +293,7 @@ define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in)
; GCN-LABEL: {{^}}commute_slt_neg16_i64:
; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -306,7 +306,7 @@ define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in
; GCN-LABEL: {{^}}commute_sle_5_i64:
; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -324,7 +324,7 @@ define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_oeq_2.0_f32:
; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -338,7 +338,7 @@ define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ogt_2.0_f32:
; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -351,7 +351,7 @@ define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_oge_2.0_f32:
; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -364,7 +364,7 @@ define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_olt_2.0_f32:
; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -377,7 +377,7 @@ define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ole_2.0_f32:
; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -390,7 +390,7 @@ define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_one_2.0_f32:
; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -403,7 +403,7 @@ define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ord_2.0_f32:
; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
-define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -416,7 +416,7 @@ define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ueq_2.0_f32:
; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -429,7 +429,7 @@ define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ugt_2.0_f32:
; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -442,7 +442,7 @@ define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_uge_2.0_f32:
; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -455,7 +455,7 @@ define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ult_2.0_f32:
; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -468,7 +468,7 @@ define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ule_2.0_f32:
; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -481,7 +481,7 @@ define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_une_2.0_f32:
; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -494,7 +494,7 @@ define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_uno_2.0_f32:
; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
-define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -512,7 +512,7 @@ define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_oeq_2.0_f64:
; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -526,7 +526,7 @@ define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ogt_2.0_f64:
; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -539,7 +539,7 @@ define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_oge_2.0_f64:
; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -552,7 +552,7 @@ define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_olt_2.0_f64:
; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -565,7 +565,7 @@ define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ole_2.0_f64:
; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -578,7 +578,7 @@ define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_one_2.0_f64:
; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -591,7 +591,7 @@ define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ord_2.0_f64:
; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
-define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -604,7 +604,7 @@ define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ueq_2.0_f64:
; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -617,7 +617,7 @@ define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ugt_2.0_f64:
; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -630,7 +630,7 @@ define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_uge_2.0_f64:
; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -643,7 +643,7 @@ define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ult_2.0_f64:
; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -656,7 +656,7 @@ define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ule_2.0_f64:
; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -669,7 +669,7 @@ define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_une_2.0_f64:
; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -682,7 +682,7 @@ define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_uno_2.0_f64:
; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
-define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -701,9 +701,9 @@ define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_frameindex:
; XGCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
; GCN: v_cmp_eq_u32_e32 vcc, [[FI]], v{{[0-9]+}}
-define void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 {
+define amdgpu_kernel void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 {
entry:
%stack0 = alloca i32
%ptr0 = load volatile i32*, i32* addrspace(1)* undef
diff --git a/test/CodeGen/AMDGPU/commute-shifts.ll b/test/CodeGen/AMDGPU/commute-shifts.ll
index 862f236514ca..84d8bf2bd706 100644
--- a/test/CodeGen/AMDGPU/commute-shifts.ll
+++ b/test/CodeGen/AMDGPU/commute-shifts.ll
@@ -4,10 +4,10 @@
; GCN-LABEL: {{^}}main:
; SI: v_lshl_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
; VI: v_lshlrev_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 1
-define amdgpu_ps void @main(float %arg0, float %arg1) #0 {
+define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
bb:
%tmp = fptosi float %arg0 to i32
- %tmp1 = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp1 = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> undef, <8 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false)
%tmp2.f = extractelement <4 x float> %tmp1, i32 0
%tmp2 = bitcast float %tmp2.f to i32
%tmp3 = and i32 %tmp, 7
@@ -15,15 +15,14 @@ bb:
%tmp5 = and i32 %tmp2, %tmp4
%tmp6 = icmp eq i32 %tmp5, 0
%tmp7 = select i1 %tmp6, float 0.000000e+00, float %arg1
- %tmp8 = call i32 @llvm.SI.packf16(float undef, float %tmp7)
- %tmp9 = bitcast i32 %tmp8 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %tmp9, float undef, float %tmp9)
- ret void
+ %tmp8 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp7)
+ %tmp9 = bitcast <2 x half> %tmp8 to float
+ ret float %tmp9
}
-declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare i32 @llvm.SI.packf16(float, float) #1
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
+declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/commute_modifiers.ll b/test/CodeGen/AMDGPU/commute_modifiers.ll
index ed4ec82eb3e3..8820e4fd80e5 100644
--- a/test/CodeGen/AMDGPU/commute_modifiers.ll
+++ b/test/CodeGen/AMDGPU/commute_modifiers.ll
@@ -8,7 +8,7 @@ declare float @llvm.fma.f32(float, float, float) nounwind readnone
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, 2.0
; SI: buffer_store_dword [[REG]]
-define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%x = load float, float addrspace(1)* %gep.0
@@ -22,7 +22,7 @@ define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -4.0
; SI: buffer_store_dword [[REG]]
-define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%x = load float, float addrspace(1)* %gep.0
@@ -37,7 +37,7 @@ define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrs
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]]
; SI: buffer_store_dword [[REG]]
-define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%x = load float, float addrspace(1)* %gep.0
@@ -53,7 +53,7 @@ define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(
; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000
; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[K]], |[[X]]|
; SI: buffer_store_dword [[REG]]
-define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%x = load float, float addrspace(1)* %gep.0
@@ -68,7 +68,7 @@ define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]|
; SI: buffer_store_dword [[REG]]
-define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -85,7 +85,7 @@ define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)*
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]]
; SI: buffer_store_dword [[REG]]
-define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -102,7 +102,7 @@ define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)*
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]|
; SI: buffer_store_dword [[REG]]
-define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -121,7 +121,7 @@ define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]|
; SI: buffer_store_dword [[REG]]
-define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -139,7 +139,7 @@ define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrs
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]|
; SI: buffer_store_dword [[REG]]
-define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -161,7 +161,7 @@ define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float
; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, |[[R2]]|
; SI: buffer_store_dword [[RESULT]]
-define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
diff --git a/test/CodeGen/AMDGPU/concat_vectors.ll b/test/CodeGen/AMDGPU/concat_vectors.ll
index 2e6be5d10f09..7394842d156f 100644
--- a/test/CodeGen/AMDGPU/concat_vectors.ll
+++ b/test/CodeGen/AMDGPU/concat_vectors.ll
@@ -8,7 +8,7 @@
; value if we want to ensure scratch memory is not being used.
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
%concat = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1>
store <2 x i32> %concat, <2 x i32> addrspace(1)* %out, align 8
ret void
@@ -17,7 +17,7 @@ define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x
; FUNC-LABEL: {{^}}test_concat_v2i32:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
%concat = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
store <4 x i32> %concat, <4 x i32> addrspace(1)* %out, align 16
ret void
@@ -26,7 +26,7 @@ define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x
; FUNC-LABEL: {{^}}test_concat_v4i32:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
%concat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x i32> %concat, <8 x i32> addrspace(1)* %out, align 32
ret void
@@ -35,7 +35,7 @@ define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x
; FUNC-LABEL: {{^}}test_concat_v8i32:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind {
%concat = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
store <16 x i32> %concat, <16 x i32> addrspace(1)* %out, align 64
ret void
@@ -44,7 +44,7 @@ define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x
; FUNC-LABEL: {{^}}test_concat_v16i32:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind {
%concat = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
store <32 x i32> %concat, <32 x i32> addrspace(1)* %out, align 128
ret void
@@ -53,7 +53,7 @@ define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <1
; FUNC-LABEL: {{^}}test_concat_v1f32:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind {
%concat = shufflevector <1 x float> %a, <1 x float> %b, <2 x i32> <i32 0, i32 1>
store <2 x float> %concat, <2 x float> addrspace(1)* %out, align 8
ret void
@@ -62,7 +62,7 @@ define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <
; FUNC-LABEL: {{^}}test_concat_v2f32:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
%concat = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
store <4 x float> %concat, <4 x float> addrspace(1)* %out, align 16
ret void
@@ -71,7 +71,7 @@ define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <
; FUNC-LABEL: {{^}}test_concat_v4f32:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
%concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x float> %concat, <8 x float> addrspace(1)* %out, align 32
ret void
@@ -80,7 +80,7 @@ define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <
; FUNC-LABEL: {{^}}test_concat_v8f32:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
%concat = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
store <16 x float> %concat, <16 x float> addrspace(1)* %out, align 64
ret void
@@ -89,7 +89,7 @@ define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a,
; FUNC-LABEL: {{^}}test_concat_v16f32:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
%concat = shufflevector <16 x float> %a, <16 x float> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
store <32 x float> %concat, <32 x float> addrspace(1)* %out, align 128
ret void
@@ -98,7 +98,7 @@ define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a
; FUNC-LABEL: {{^}}test_concat_v1i64:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
%concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1>
store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16
ret void
@@ -107,7 +107,7 @@ define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a,
; FUNC-LABEL: {{^}}test_concat_v2i64:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
%concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32
ret void
@@ -116,7 +116,7 @@ define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a,
; FUNC-LABEL: {{^}}test_concat_v4i64:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
%concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64
ret void
@@ -125,7 +125,7 @@ define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a,
; FUNC-LABEL: {{^}}test_concat_v8i64:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
%concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128
ret void
@@ -134,7 +134,7 @@ define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a
; FUNC-LABEL: {{^}}test_concat_v16i64:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
%concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256
ret void
@@ -143,7 +143,7 @@ define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double>
; FUNC-LABEL: {{^}}test_concat_v1f64:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
%concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1>
store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16
ret void
@@ -152,7 +152,7 @@ define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a,
; FUNC-LABEL: {{^}}test_concat_v2f64:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
%concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32
ret void
@@ -161,7 +161,7 @@ define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a,
; FUNC-LABEL: {{^}}test_concat_v4f64:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
%concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64
ret void
@@ -170,7 +170,7 @@ define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a,
; FUNC-LABEL: {{^}}test_concat_v8f64:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
%concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128
ret void
@@ -179,7 +179,7 @@ define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a
; FUNC-LABEL: {{^}}test_concat_v16f64:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
%concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256
ret void
@@ -188,7 +188,7 @@ define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double>
; FUNC-LABEL: {{^}}test_concat_v1i1:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind {
%concat = shufflevector <1 x i1> %a, <1 x i1> %b, <2 x i32> <i32 0, i32 1>
store <2 x i1> %concat, <2 x i1> addrspace(1)* %out
ret void
@@ -197,7 +197,7 @@ define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1>
; FUNC-LABEL: {{^}}test_concat_v2i1:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind {
%concat = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
store <4 x i1> %concat, <4 x i1> addrspace(1)* %out
ret void
@@ -206,7 +206,7 @@ define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1>
; FUNC-LABEL: {{^}}test_concat_v4i1:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind {
%concat = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x i1> %concat, <8 x i1> addrspace(1)* %out
ret void
@@ -215,7 +215,7 @@ define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1>
; FUNC-LABEL: {{^}}test_concat_v8i1:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind {
%concat = shufflevector <8 x i1> %a, <8 x i1> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
store <16 x i1> %concat, <16 x i1> addrspace(1)* %out
ret void
@@ -224,7 +224,7 @@ define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1
; FUNC-LABEL: {{^}}test_concat_v16i1:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind {
%concat = shufflevector <16 x i1> %a, <16 x i1> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
store <32 x i1> %concat, <32 x i1> addrspace(1)* %out
ret void
@@ -233,7 +233,7 @@ define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x
; FUNC-LABEL: {{^}}test_concat_v32i1:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind {
%concat = shufflevector <32 x i1> %a, <32 x i1> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
store <64 x i1> %concat, <64 x i1> addrspace(1)* %out
ret void
@@ -242,7 +242,7 @@ define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x
; FUNC-LABEL: {{^}}test_concat_v1i16:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind {
%concat = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> <i32 0, i32 1>
store <2 x i16> %concat, <2 x i16> addrspace(1)* %out, align 4
ret void
@@ -251,7 +251,7 @@ define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x
; FUNC-LABEL: {{^}}test_concat_v2i16:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind {
%concat = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
store <4 x i16> %concat, <4 x i16> addrspace(1)* %out, align 8
ret void
@@ -260,7 +260,7 @@ define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x
; FUNC-LABEL: {{^}}test_concat_v4i16:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind {
%concat = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x i16> %concat, <8 x i16> addrspace(1)* %out, align 16
ret void
@@ -269,7 +269,7 @@ define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x
; FUNC-LABEL: {{^}}test_concat_v8i16:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind {
%concat = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
store <16 x i16> %concat, <16 x i16> addrspace(1)* %out, align 32
ret void
@@ -278,7 +278,7 @@ define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x
; FUNC-LABEL: {{^}}test_concat_v16i16:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
-define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind {
%concat = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
store <32 x i16> %concat, <32 x i16> addrspace(1)* %out, align 64
ret void
@@ -286,7 +286,7 @@ define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <1
; FUNC-LABEL: {{^}}concat_vector_crash:
; SI: s_endpgm
-define void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
bb:
%tmp = load <2 x float>, <2 x float> addrspace(1)* %in, align 4
%tmp1 = shufflevector <2 x float> %tmp, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
diff --git a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
index 34bb2588ad62..62b47beb1251 100644
--- a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
+++ b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
@@ -1,12 +1,12 @@
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination -o - %s | FileCheck -check-prefix=GCN %s
--- |
- define void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ define amdgpu_kernel void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%and = and i32 %a, 1234567
store volatile i32 %and, i32 addrspace(1)* %out
ret void
}
- define void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+ define amdgpu_kernel void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%idxprom = sext i32 %tid to i64
%gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
@@ -17,13 +17,13 @@
ret void
}
- define void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ define amdgpu_kernel void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%shl = shl i32 %a, 12
store volatile i32 %shl, i32 addrspace(1)* %out
ret void
}
- define void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+ define amdgpu_kernel void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%idxprom = sext i32 %tid to i64
%gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
@@ -34,13 +34,13 @@
ret void
}
- define void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ define amdgpu_kernel void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%ashr = ashr i32 %a, 12
store volatile i32 %ashr, i32 addrspace(1)* %out
ret void
}
- define void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+ define amdgpu_kernel void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%idxprom = sext i32 %tid to i64
%gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
@@ -51,13 +51,13 @@
ret void
}
- define void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ define amdgpu_kernel void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%lshr = lshr i32 %a, 12
store volatile i32 %lshr, i32 addrspace(1)* %out
ret void
}
- define void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+ define amdgpu_kernel void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%idxprom = sext i32 %tid to i64
%gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
diff --git a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
index 0ff75ab58003..0831d250b9e7 100644
--- a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
+++ b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
@@ -5,7 +5,7 @@
; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
-define void @fold_mi_v_and_0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_v_and_0(i32 addrspace(1)* %out) {
%x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%size = call i32 @llvm.amdgcn.groupstaticsize()
%and = and i32 %size, %x
@@ -17,7 +17,7 @@ define void @fold_mi_v_and_0(i32 addrspace(1)* %out) {
; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
-define void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 {
%size = call i32 @llvm.amdgcn.groupstaticsize()
%and = and i32 %size, %x
store i32 %and, i32 addrspace(1)* %out
@@ -28,7 +28,7 @@ define void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 {
; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
-define void @fold_mi_v_or_0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_v_or_0(i32 addrspace(1)* %out) {
%x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%size = call i32 @llvm.amdgcn.groupstaticsize()
%or = or i32 %size, %x
@@ -42,7 +42,7 @@ define void @fold_mi_v_or_0(i32 addrspace(1)* %out) {
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
; GCN-NOT: [[VVAL]]
; GCN: buffer_store_dword [[VVAL]]
-define void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 {
%size = call i32 @llvm.amdgcn.groupstaticsize()
%or = or i32 %size, %x
store i32 %or, i32 addrspace(1)* %out
@@ -53,7 +53,7 @@ define void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 {
; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
-define void @fold_mi_v_xor_0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_v_xor_0(i32 addrspace(1)* %out) {
%x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%size = call i32 @llvm.amdgcn.groupstaticsize()
%xor = xor i32 %size, %x
@@ -67,7 +67,7 @@ define void @fold_mi_v_xor_0(i32 addrspace(1)* %out) {
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
; GCN-NOT: [[VVAL]]
; GCN: buffer_store_dword [[VVAL]]
-define void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 {
%size = call i32 @llvm.amdgcn.groupstaticsize()
%xor = xor i32 %size, %x
store i32 %xor, i32 addrspace(1)* %out
@@ -78,7 +78,7 @@ define void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 {
; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], -1{{$}}
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
-define void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 {
%size = call i32 @llvm.amdgcn.groupstaticsize()
%xor = xor i32 %size, -1
store i32 %xor, i32 addrspace(1)* %out
@@ -91,7 +91,7 @@ define void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 {
; GCN-NEXT: v_not_b32_e32 v[[RESULT_LO]]
; GCN-NEXT: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], -1{{$}}
; GCN-NEXT: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
-define void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
%vreg = load volatile i64, i64 addrspace(1)* undef
%ctpop = call i64 @llvm.ctpop.i64(i64 %vreg)
%xor = xor i64 %ctpop, -1
@@ -110,7 +110,7 @@ define void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]]
; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
-define void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
%vreg0 = load volatile i64, i64 addrspace(1)* undef
%vreg1 = load volatile i64, i64 addrspace(1)* undef
%ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0)
@@ -126,7 +126,7 @@ define void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
; GCN: v_not_b32
; GCN: v_and_b32
; GCN-NOT: v_and_b32
-define void @fold_mi_and_neg1(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_and_neg1(i64 addrspace(1)* %out) {
%vreg0 = load volatile i64, i64 addrspace(1)* undef
%vreg1 = load volatile i64, i64 addrspace(1)* undef
%ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0)
diff --git a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index 13383cbc1741..d3e6c11ef908 100644
--- a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -10,6 +10,8 @@
; GCN-LABEL: {{^}}divergent_if_endif:
+; VGPR: workitem_private_segment_byte_size = 12{{$}}
+
; GCN: {{^}}; BB#0:
; GCN: s_mov_b32 m0, -1
@@ -26,12 +28,13 @@
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:8 ; 4-byte Folded Spill
; Spill load
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+
; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
; GCN: s_waitcnt vmcnt(0) expcnt(0)
@@ -55,11 +58,11 @@
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:8 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
@@ -69,7 +72,7 @@
; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], s7 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]]
-define void @divergent_if_endif(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @divergent_if_endif(i32 addrspace(1)* %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%load0 = load volatile i32, i32 addrspace(3)* undef
@@ -88,6 +91,8 @@ endif:
}
; GCN-LABEL: {{^}}divergent_loop:
+; VGPR: workitem_private_segment_byte_size = 16{{$}}
+
; GCN: {{^}}; BB#0:
; GCN: s_mov_b32 m0, -1
@@ -100,7 +105,7 @@ endif:
; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}
; Spill load
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 ; 4-byte Folded Spill
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
; Spill saved exec
; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
@@ -108,9 +113,9 @@ endif:
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:16 ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:24 ; 4-byte Folded Spill
; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
@@ -120,7 +125,7 @@ endif:
; GCN: [[LOOP:BB[0-9]+_[0-9]+]]:
-; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]]
; GCN: v_cmp_ne_u32_e32 vcc,
; GCN: s_and_b64 vcc, exec, vcc
@@ -133,11 +138,11 @@ endif:
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:16 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:24 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
@@ -145,7 +150,7 @@ endif:
; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]]
-define void @divergent_loop(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @divergent_loop(i32 addrspace(1)* %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%load0 = load volatile i32, i32 addrspace(3)* undef
@@ -180,7 +185,7 @@ end:
; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}
; Spill load
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 ; 4-byte Folded Spill
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
; Spill saved exec
; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
@@ -237,14 +242,14 @@ end:
; GCN: BB{{[0-9]+}}_2: ; %if
; GCN: ds_read_b32
-; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill
; GCN: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_branch [[ENDIF:BB[0-9]+_[0-9]+]]
; GCN: [[ELSE]]: ; %else
-; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
; GCN: v_subrev_i32_e32 [[SUB:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: s_waitcnt vmcnt(0) expcnt(0)
@@ -267,7 +272,7 @@ end:
; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]]
-define void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%load0 = load volatile i32, i32 addrspace(3)* undef
diff --git a/test/CodeGen/AMDGPU/convergent-inlineasm.ll b/test/CodeGen/AMDGPU/convergent-inlineasm.ll
index 755f439c6863..0074a41e44cf 100644
--- a/test/CodeGen/AMDGPU/convergent-inlineasm.ll
+++ b/test/CodeGen/AMDGPU/convergent-inlineasm.ll
@@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
; GCN: v_cmp_ne_u32_e64
; GCN: ; mask branch
; GCN: BB{{[0-9]+_[0-9]+}}:
-define void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
bb:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1) #1
@@ -29,7 +29,8 @@ bb5: ; preds = %bb3, %bb
; GCN: v_cmp_ne_u32_e64
; GCN: BB{{[0-9]+_[0-9]+}}:
-define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
+
+define amdgpu_kernel void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
bb:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1)
diff --git a/test/CodeGen/AMDGPU/copy-illegal-type.ll b/test/CodeGen/AMDGPU/copy-illegal-type.ll
index 7434d745b259..026dd7ca6c87 100644
--- a/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
; GCN: buffer_load_dword [[REG:v[0-9]+]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
-define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
ret void
@@ -19,7 +19,7 @@ define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)*
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
-define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
@@ -32,7 +32,7 @@ define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
-define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
@@ -47,7 +47,7 @@ define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
-define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
@@ -65,7 +65,7 @@ define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(
; GCN-DAG: buffer_store_dword
; GCN: s_endpgm
-define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
%add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
@@ -85,7 +85,7 @@ define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> add
; GCN: {{buffer|flat}}_store_dword
; GCN: {{buffer|flat}}_store_dword
; GCN: s_endpgm
-define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
@@ -101,7 +101,7 @@ define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8>
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; GCN: s_endpgm
-define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
%val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
ret void
@@ -113,7 +113,7 @@ define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspa
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; GCN: s_endpgm
-define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
%val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2
store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2
ret void
@@ -128,7 +128,7 @@ define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspa
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
-define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
%val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1
store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1
ret void
@@ -141,7 +141,7 @@ define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspa
; GCN: buffer_load_ubyte
; GCN: buffer_store_dword
; GCN: s_endpgm
-define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
%val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
ret void
@@ -157,7 +157,7 @@ define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8>
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
-define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/copy-to-reg.ll b/test/CodeGen/AMDGPU/copy-to-reg.ll
index 3422a889a520..f35b0706f3d3 100644
--- a/test/CodeGen/AMDGPU/copy-to-reg.ll
+++ b/test/CodeGen/AMDGPU/copy-to-reg.ll
@@ -6,7 +6,7 @@
; Make sure this doesn't crash
; CHECK-LABEL: {{^}}copy_to_reg_frameindex:
-define void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
entry:
%alloca = alloca [16 x i32]
br label %loop
diff --git a/test/CodeGen/AMDGPU/ctlz.ll b/test/CodeGen/AMDGPU/ctlz.ll
index 1a0027dd4a3c..e252971e3f42 100644
--- a/test/CodeGen/AMDGPU/ctlz.ll
+++ b/test/CodeGen/AMDGPU/ctlz.ll
@@ -27,7 +27,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; EG: FFBH_UINT
; EG: CNDE_INT
-define void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
store i32 %ctlz, i32 addrspace(1)* %out, align 4
ret void
@@ -43,7 +43,7 @@ define void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
; EG: FFBH_UINT
; EG: CNDE_INT
-define void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr, align 4
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
store i32 %ctlz, i32 addrspace(1)* %out, align 4
@@ -61,7 +61,7 @@ define void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalia
; EG: CNDE_INT
; EG: FFBH_UINT
; EG: CNDE_INT
-define void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
%val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
%ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
@@ -89,7 +89,7 @@ define void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrsp
; EG-DAG: FFBH_UINT
; EG-DAG: CNDE_INT
-define void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
%val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
%ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
@@ -98,10 +98,11 @@ define void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrsp
; FUNC-LABEL: {{^}}v_ctlz_i8:
; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
-; GCN-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; VI-DAG: v_ffbh_u32_sdwa [[RESULT:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GCN: buffer_store_byte [[RESULT]],
; GCN: s_endpgm
-define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
%val = load i8, i8 addrspace(1)* %valptr
%ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
store i8 %ctlz, i8 addrspace(1)* %out
@@ -119,14 +120,14 @@ define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %
; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
-define void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
store i64 %ctlz, i64 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}s_ctlz_i64_trunc:
-define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
%trunc = trunc i64 %ctlz to i32
store i32 %trunc, i32 addrspace(1)* %out
@@ -145,7 +146,7 @@ define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind
; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]]
; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ_LO]]:[[CTLZ_HI]]{{\]}}
-define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
@@ -156,7 +157,7 @@ define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalia
}
; FUNC-LABEL: {{^}}v_ctlz_i64_trunc:
-define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -172,7 +173,7 @@ define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)*
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
-define void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
%cmp = icmp eq i32 %val, 0
@@ -186,7 +187,7 @@ define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)*
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
-define void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
%cmp = icmp ne i32 %val, 0
@@ -202,7 +203,7 @@ define void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspac
; GCN: v_cmp
; GCN: v_cndmask
; GCN: s_endpgm
-define void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
%cmp = icmp eq i32 %ctlz, 32
@@ -217,7 +218,7 @@ define void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addr
; GCN: v_cmp
; GCN: v_cndmask
; GCN: s_endpgm
-define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
%cmp = icmp ne i32 %ctlz, 32
@@ -230,7 +231,7 @@ define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addr
; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
; GCN: {{buffer|flat}}_store_byte [[FFBH]],
-define void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
%val = load i8, i8 addrspace(1)* %valptr.gep
@@ -245,7 +246,7 @@ define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addr
; SI: buffer_load_ushort [[VAL:v[0-9]+]],
; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
; SI: buffer_store_short [[FFBH]],
-define void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
%val = load i16, i16 addrspace(1)* %valptr
%ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
%cmp = icmp eq i16 %val, 0
@@ -260,7 +261,7 @@ define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addr
; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
; GCN: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7f, [[FFBH]]
; GCN: {{buffer|flat}}_store_byte [[TRUNC]],
-define void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid
%val = load i7, i7 addrspace(1)* %valptr.gep
diff --git a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index d390f64deeab..87ba563a740f 100644
--- a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -22,7 +22,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; GCN: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
-define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
store i32 %ctlz, i32 addrspace(1)* %out, align 4
ret void
@@ -35,7 +35,7 @@ define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nou
; GCN: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
-define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr, align 4
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
store i32 %ctlz, i32 addrspace(1)* %out, align 4
@@ -51,7 +51,7 @@ define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
-define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
%val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
%ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
@@ -71,7 +71,7 @@ define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
-define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
%val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
%ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
@@ -82,7 +82,7 @@ define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x
; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_byte [[RESULT]],
-define void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
%val = load i8, i8 addrspace(1)* %valptr
%ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
store i8 %ctlz, i8 addrspace(1)* %out
@@ -100,14 +100,14 @@ define void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)
; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
; GCN: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
-define void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
store i64 %ctlz, i64 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64_trunc:
-define void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
%trunc = trunc i64 %ctlz to i32
store i32 %trunc, i32 addrspace(1)* %out
@@ -123,7 +123,7 @@ define void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %va
; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
-define void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
@@ -134,7 +134,7 @@ define void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace
}
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64_trunc:
-define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -149,7 +149,7 @@ define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 add
; GCN: buffer_load_dword [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
-define void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp eq i32 %val, 0
@@ -162,7 +162,7 @@ define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 add
; GCN: buffer_load_dword [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
-define void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp ne i32 %val, 0
@@ -175,7 +175,7 @@ define void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i
; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
; GCN: {{buffer|flat}}_store_byte [[FFBH]],
-define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
%val = load i8, i8 addrspace(1)* %valptr.gep
@@ -194,7 +194,7 @@ define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8
; GCN-DAG: buffer_store_dword [[RESULT0]]
; GCN-DAG: buffer_store_byte [[RESULT1]]
; GCN: s_endpgm
-define void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp eq i32 %val, 0
@@ -211,7 +211,7 @@ define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8
; GCN: v_cmp
; GCN: v_cndmask
; GCN: buffer_store_dword
-define void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp eq i32 %val, 0
@@ -227,7 +227,7 @@ define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8
; GCN: v_cmp
; GCN: v_cndmask
; GCN: buffer_store_dword
-define void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp ne i32 %val, 0
@@ -243,7 +243,7 @@ define void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: buffer_store_dword
-define void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp eq i32 %val, 1
@@ -259,7 +259,7 @@ define void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: buffer_store_dword
-define void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp ne i32 %val, 1
diff --git a/test/CodeGen/AMDGPU/ctpop.ll b/test/CodeGen/AMDGPU/ctpop.ll
index 9692236bb363..a29e72ea57cb 100644
--- a/test/CodeGen/AMDGPU/ctpop.ll
+++ b/test/CodeGen/AMDGPU/ctpop.ll
@@ -16,7 +16,7 @@ declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone
; GCN: s_endpgm
; EG: BCNT_INT
-define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
store i32 %ctpop, i32 addrspace(1)* %out, align 4
ret void
@@ -30,7 +30,7 @@ define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
; GCN: s_endpgm
; EG: BCNT_INT
-define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%val = load i32, i32 addrspace(1)* %in, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
store i32 %ctpop, i32 addrspace(1)* %out, align 4
@@ -48,7 +48,7 @@ define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noali
; EG: BCNT_INT
; EG: BCNT_INT
-define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
+define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
%val0 = load i32, i32 addrspace(1)* %in0, align 4
%val1 = load i32, i32 addrspace(1)* %in1, align 4
%ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
@@ -64,7 +64,7 @@ define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace
; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
-define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
+define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
%val0 = load i32, i32 addrspace(1)* %in0, align 4
%ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
%add = add i32 %ctpop0, %sval
@@ -79,7 +79,7 @@ define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(
; EG: BCNT_INT
; EG: BCNT_INT
-define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
%val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
%ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone
store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8
@@ -97,7 +97,7 @@ define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrs
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
-define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
%val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
%ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone
store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16
@@ -123,7 +123,7 @@ define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrs
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
-define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
%val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
%ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone
store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32
@@ -165,7 +165,7 @@ define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrs
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
-define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
%val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32
%ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone
store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32
@@ -179,7 +179,7 @@ define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> ad
; GCN: s_endpgm
; EG: BCNT_INT
-define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%val = load i32, i32 addrspace(1)* %in, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%add = add i32 %ctpop, 4
@@ -194,7 +194,7 @@ define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32
; GCN: s_endpgm
; EG: BCNT_INT
-define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%val = load i32, i32 addrspace(1)* %in, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%add = add i32 4, %ctpop
@@ -209,7 +209,7 @@ define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out,
; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
-define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%val = load i32, i32 addrspace(1)* %in, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%add = add i32 %ctpop, 99999
@@ -225,7 +225,7 @@ define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspa
; GCN: s_endpgm
; EG: BCNT_INT
-define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
%val = load i32, i32 addrspace(1)* %in, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%add = add i32 %ctpop, %const
@@ -241,7 +241,7 @@ define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1
; GCN: s_endpgm
; EG: BCNT_INT
-define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
%val = load i32, i32 addrspace(1)* %in, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%add = add i32 %const, %ctpop
@@ -258,7 +258,7 @@ define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspa
; GCN: s_endpgm
; EG: BCNT_INT
-define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
%val = load i32, i32 addrspace(1)* %in, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4
@@ -279,7 +279,7 @@ define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrsp
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
; EG: BCNT_INT
-define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
+define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
entry:
%tmp0 = icmp eq i32 %cond, 0
br i1 %tmp0, label %if, label %else
diff --git a/test/CodeGen/AMDGPU/ctpop64.ll b/test/CodeGen/AMDGPU/ctpop64.ll
index cd5d805e5db3..2610684ad9ee 100644
--- a/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/test/CodeGen/AMDGPU/ctpop64.ll
@@ -17,7 +17,7 @@ declare i128 @llvm.ctpop.i128(i128) nounwind readnone
; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
; GCN: buffer_store_dword [[VRESULT]],
; GCN: s_endpgm
-define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
%ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
%truncctpop = trunc i64 %ctpop to i32
store i32 %truncctpop, i32 addrspace(1)* %out, align 4
@@ -31,7 +31,7 @@ define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
-define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%val = load i64, i64 addrspace(1)* %in, align 8
%ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
%truncctpop = trunc i64 %ctpop to i32
@@ -48,7 +48,7 @@ define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noali
; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}
; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
; GCN: s_endpgm
-define void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind {
+define amdgpu_kernel void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind {
%val = load i64, i64 addrspace(1)* %in, align 8
%ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
%or = or i64 %ctpop, %s.val
@@ -60,7 +60,7 @@ define void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)*
; GCN: s_bcnt1_i32_b64
; GCN: s_bcnt1_i32_b64
; GCN: s_endpgm
-define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind {
+define amdgpu_kernel void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind {
%ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
%truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
@@ -73,7 +73,7 @@ define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val)
; GCN: s_bcnt1_i32_b64
; GCN: s_bcnt1_i32_b64
; GCN: s_endpgm
-define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind {
+define amdgpu_kernel void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind {
%ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
%truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
@@ -86,7 +86,7 @@ define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val)
; GCN: v_bcnt_u32_b32
; GCN: v_bcnt_u32_b32
; GCN: s_endpgm
-define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
%val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
%ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
%truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
@@ -104,7 +104,7 @@ define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrs
; GCN: v_bcnt_u32_b32
; GCN: v_bcnt_u32_b32
; GCN: s_endpgm
-define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
%val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
%ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
%truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
@@ -121,7 +121,7 @@ define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrs
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[ZERO]]
; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}}
; GCN: s_endpgm
-define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {
+define amdgpu_kernel void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {
entry:
%tmp0 = icmp eq i32 %cond, 0
br i1 %tmp0, label %if, label %else
@@ -146,7 +146,7 @@ endif:
; GCN: s_bcnt1_i32_b64 [[SRESULT1:s[0-9]+]],
; GCN: s_add_i32 s{{[0-9]+}}, [[SRESULT1]], [[SRESULT0]]
; GCN: s_endpgm
-define void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind {
%ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
%truncctpop = trunc i128 %ctpop to i32
store i32 %truncctpop, i32 addrspace(1)* %out, align 4
@@ -159,7 +159,7 @@ define void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind {
; GCN: s_bcnt1_i32_b64 [[REG1:s[0-9]+]],
; GCN: s_add_i32 {{s[0-9]+}}, [[REG0]], [[REG1]]
; GCN: s_endpgm
-define void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind {
%ctpop = call i65 @llvm.ctpop.i65(i65 %val) nounwind readnone
%truncctpop = trunc i65 %ctpop to i32
store i32 %truncctpop, i32 addrspace(1)* %out, align 4
@@ -181,7 +181,7 @@ define void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind {
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
-define void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind {
%val = load i128, i128 addrspace(1)* %in, align 8
%ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
%truncctpop = trunc i128 %ctpop to i32
diff --git a/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index e33cc18eb05f..1fa6407647eb 100644
--- a/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -14,7 +14,7 @@ declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
; SI: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
-define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
%cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
store i32 %cttz, i32 addrspace(1)* %out, align 4
ret void
@@ -27,7 +27,7 @@ define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nou
; SI: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
-define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr, align 4
%cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
store i32 %cttz, i32 addrspace(1)* %out, align 4
@@ -43,7 +43,7 @@ define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
; EG: FFBL_INT {{\*? *}}[[RESULT]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
-define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
%val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
%cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
@@ -63,7 +63,7 @@ define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x
; EG: FFBL_INT {{\*? *}}[[RESULT]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
-define void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
%val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
%cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
diff --git a/test/CodeGen/AMDGPU/cube.ll b/test/CodeGen/AMDGPU/cube.ll
index 9b512c439b0e..7b5f1aff7ea6 100644
--- a/test/CodeGen/AMDGPU/cube.ll
+++ b/test/CodeGen/AMDGPU/cube.ll
@@ -6,16 +6,13 @@ declare float @llvm.amdgcn.cubesc(float, float, float) #0
declare float @llvm.amdgcn.cubetc(float, float, float) #0
declare float @llvm.amdgcn.cubema(float, float, float) #0
-declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #0
-
-
; GCN-LABEL: {{^}}cube:
; GCN-DAG: v_cubeid_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_cubesc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: _store_dwordx4
-define void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) #1 {
%cubeid = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
%cubesc = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c)
%cubetc = call float @llvm.amdgcn.cubetc(float %a, float %b, float %c)
@@ -29,18 +26,5 @@ define void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c)
ret void
}
-; GCN-LABEL: {{^}}legacy_cube:
-; GCN-DAG: v_cubeid_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: v_cubesc_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-; GCN: _store_dwordx4
-define void @legacy_cube(<4 x float> addrspace(1)* %out, <4 x float> %abcx) #1 {
- %cube = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %abcx)
- store <4 x float> %cube, <4 x float> addrspace(1)* %out
- ret void
-}
-
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
-
diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 7baaa81fba59..e16daa6fad9d 100644
--- a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
; GCN-NOT: lshr
; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
; GCN: buffer_store_dword [[CONV]],
-define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
%load = load i8, i8 addrspace(1)* %in, align 1
%cvt = uitofp i8 %load to float
store float %cvt, float addrspace(1)* %out, align 4
@@ -22,7 +22,7 @@ define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* n
; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]]
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
-define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2
%cvt = uitofp <2 x i8> %load to <2 x float>
store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
@@ -36,7 +36,7 @@ define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8>
; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
-define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
%cvt = uitofp <3 x i8> %load to <3 x float>
store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
@@ -52,7 +52,7 @@ define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8>
; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]]
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
-define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
%cvt = uitofp <4 x i8> %load to <4 x float>
store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
@@ -76,7 +76,7 @@ define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8>
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]]
; GCN: buffer_store_dwordx4
-define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
%cvt = uitofp <4 x i8> %load to <4 x float>
store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
@@ -110,7 +110,7 @@ define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out
; GCN: {{buffer|flat}}_store_dword
; GCN: s_endpgm
-define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
@@ -124,7 +124,7 @@ define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <
; Make sure this doesn't crash.
; GCN-LABEL: {{^}}load_v7i8_to_v7f32:
; GCN: s_endpgm
-define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1
%cvt = uitofp <7 x i8> %load to <7 x float>
store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
@@ -147,7 +147,7 @@ define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8>
; GCN-NOT: lshr
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
-define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
%cvt = uitofp <8 x i8> %load to <8 x float>
store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
@@ -159,7 +159,7 @@ define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8>
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]]
; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
; GCN: buffer_store_dword [[CONV]],
-define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%load = load i32, i32 addrspace(1)* %in, align 4
%add = add i32 %load, 2
%inreg = and i32 %add, 255
@@ -169,7 +169,7 @@ define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addr
}
; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
-define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%load = load i32, i32 addrspace(1)* %in, align 4
%inreg = and i32 %load, 65280
%shr = lshr i32 %inreg, 8
@@ -181,7 +181,7 @@ define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addr
; We don't get these ones because of the zext, but instcombine removes
; them so it shouldn't really matter.
; GCN-LABEL: {{^}}i8_zext_i32_to_f32:
-define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
%load = load i8, i8 addrspace(1)* %in, align 1
%ext = zext i8 %load to i32
%cvt = uitofp i32 %ext to float
@@ -190,7 +190,7 @@ define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1
}
; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
-define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
%ext = zext <4 x i8> %load to <4 x i32>
%cvt = uitofp <4 x i32> %ext to <4 x float>
@@ -203,7 +203,7 @@ define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
-define void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%val = load i32, i32 addrspace(1)* %in
%and = and i32 %val, 255
%cvt = uitofp i32 %and to float
@@ -216,7 +216,7 @@ define void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspac
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
-define void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%val = load i32, i32 addrspace(1)* %in
%srl = lshr i32 %val, 8
%and = and i32 %srl, 255
@@ -230,7 +230,7 @@ define void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspac
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
-define void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%val = load i32, i32 addrspace(1)* %in
%srl = lshr i32 %val, 16
%and = and i32 %srl, 255
@@ -244,7 +244,7 @@ define void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspac
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
-define void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%val = load i32, i32 addrspace(1)* %in
%srl = lshr i32 %val, 24
%and = and i32 %srl, 255
diff --git a/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll b/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll
index e7773c6e2a4f..c10cf1a8a6f2 100644
--- a/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll
+++ b/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll
@@ -10,7 +10,7 @@ declare float @llvm.floor.f32(float) #1
; SI-NOT: add
; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; SI: s_endpgm
-define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
%floor = call float @llvm.floor.f32(float %x) #1
%cvt = fptosi float %floor to i32
store i32 %cvt, i32 addrspace(1)* %out
@@ -22,7 +22,7 @@ define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
; SI-SAFE-NOT: v_cvt_flr_i32_f32
; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, [[TMP]]
; SI: s_endpgm
-define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 {
%fadd = fadd float %x, 1.0
%floor = call float @llvm.floor.f32(float %fadd) #1
%cvt = fptosi float %floor to i32
@@ -35,7 +35,7 @@ define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 {
; SI-SAFE-NOT: v_cvt_flr_i32_f32
; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|
; SI: s_endpgm
-define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
%x.fabs = call float @llvm.fabs.f32(float %x) #1
%floor = call float @llvm.floor.f32(float %x.fabs) #1
%cvt = fptosi float %floor to i32
@@ -48,7 +48,7 @@ define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
; SI-SAFE-NOT: v_cvt_flr_i32_f32
; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}
; SI: s_endpgm
-define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
%x.fneg = fsub float -0.000000e+00, %x
%floor = call float @llvm.floor.f32(float %x.fneg) #1
%cvt = fptosi float %floor to i32
@@ -61,7 +61,7 @@ define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
; SI-SAFE-NOT: v_cvt_flr_i32_f32
; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}|
; SI: s_endpgm
-define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
%x.fabs = call float @llvm.fabs.f32(float %x) #1
%x.fabs.fneg = fsub float -0.000000e+00, %x.fabs
%floor = call float @llvm.floor.f32(float %x.fabs.fneg) #1
@@ -75,7 +75,7 @@ define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
; SI: v_floor_f32
; SI: v_cvt_u32_f32_e32
; SI: s_endpgm
-define void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
%floor = call float @llvm.floor.f32(float %x) #1
%cvt = fptoui float %floor to i32
store i32 %cvt, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll b/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
index d38411dcca61..9b771ebdf7b3 100644
--- a/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
+++ b/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
@@ -9,7 +9,7 @@ declare float @llvm.floor.f32(float) #1
; SI-SAFE-NOT: v_cvt_rpi_i32_f32
; SI-NONAN: v_cvt_rpi_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; SI: s_endpgm
-define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 {
%fadd = fadd float %x, 0.5
%floor = call float @llvm.floor.f32(float %fadd) #1
%cvt = fptosi float %floor to i32
@@ -21,7 +21,7 @@ define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 {
; SI-SAFE-NOT: v_cvt_rpi_i32_f32
; SI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}}
; SI: s_endpgm
-define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
%x.fabs = call float @llvm.fabs.f32(float %x) #1
%fadd = fadd float %x.fabs, 0.5
%floor = call float @llvm.floor.f32(float %fadd) #1
@@ -37,7 +37,7 @@ define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
; SI-SAFE-NOT: v_cvt_flr_i32_f32
; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]]
; SI: s_endpgm
-define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
%x.fneg = fsub float -0.000000e+00, %x
%fadd = fadd float %x.fneg, 0.5
%floor = call float @llvm.floor.f32(float %fadd) #1
@@ -55,7 +55,7 @@ define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
; SI-SAFE-NOT: v_cvt_flr_i32_f32
; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]]
; SI: s_endpgm
-define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
%x.fabs = call float @llvm.fabs.f32(float %x) #1
%x.fabs.fneg = fsub float -0.000000e+00, %x.fabs
%fadd = fadd float %x.fabs.fneg, 0.5
@@ -71,7 +71,7 @@ define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
; SI: v_floor_f32
; SI: v_cvt_u32_f32
; SI: s_endpgm
-define void @no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
%fadd = fadd float %x, 0.5
%floor = call float @llvm.floor.f32(float %fadd) #1
%cvt = fptoui float %floor to i32
diff --git a/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll b/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
index a32c16dfac38..11acbc274eb5 100644
--- a/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
+++ b/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
@@ -9,7 +9,7 @@
; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]]
; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]]
-define void @store_same_base_ptr(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @store_same_base_ptr(i32 addrspace(1)* %out) {
entry:
%id = call i32 @llvm.amdgcn.workitem.id.x() #0
%offset = sext i32 %id to i64
diff --git a/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll b/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll
index fb43ff4fbddd..ceff889b3a7e 100644
--- a/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll
+++ b/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll
@@ -10,7 +10,7 @@
; CHECK: {{^}}sint:
; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%sint = load i32, i32 addrspace(1) * %in
@@ -24,7 +24,7 @@ entry:
;CHECK: {{^}}uint:
;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%uint = load i32, i32 addrspace(1) * %in
diff --git a/test/CodeGen/AMDGPU/debug.ll b/test/CodeGen/AMDGPU/debug.ll
index a2e0e878b740..f149aaddb8ef 100644
--- a/test/CodeGen/AMDGPU/debug.ll
+++ b/test/CodeGen/AMDGPU/debug.ll
@@ -4,7 +4,7 @@
; Test for a crash in the custom assembly dump code.
; SI: s_endpgm
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
store i32 0, i32 addrspace(1)* %out
ret void
}
diff --git a/test/CodeGen/AMDGPU/debugger-emit-prologue.ll b/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
index 49a7e722f29c..734905ba2b08 100644
--- a/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
+++ b/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
@@ -23,7 +23,7 @@
; NOATTR-NOT: DebuggerPrivateSegmentBufferSGPR
; Function Attrs: nounwind
-define void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
+define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
entry:
%A.addr = alloca i32 addrspace(1)*, align 4
store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
diff --git a/test/CodeGen/AMDGPU/debugger-insert-nops.ll b/test/CodeGen/AMDGPU/debugger-insert-nops.ll
index 6638f4e25821..fcdbfb10a8ca 100644
--- a/test/CodeGen/AMDGPU/debugger-insert-nops.ll
+++ b/test/CodeGen/AMDGPU/debugger-insert-nops.ll
@@ -1,27 +1,35 @@
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECKNOP
-; CHECK: test01.cl:2:{{[0-9]+}}
-; CHECK-NEXT: s_nop 0
+; This test expects one instance for each source line, in some order, with an "s_nop 0" after each.
-; CHECK: test01.cl:3:{{[0-9]+}}
-; CHECK-NEXT: s_nop 0
+; Check that each line appears at least once
+; CHECK-DAG: test01.cl:2:3
+; CHECK-DAG: test01.cl:3:3
+; CHECK-DAG: test01.cl:4:3
-; CHECK: test01.cl:4:{{[0-9]+}}
-; CHECK-NEXT: s_nop 0
+
+; Check that each of the lines consists of the line output, followed by "s_nop 0"
+; CHECKNOP: test01.cl:{{[234]}}:3
+; CHECKNOP-NEXT: s_nop 0
+; CHECKNOP: test01.cl:{{[234]}}:3
+; CHECKNOP-NEXT: s_nop 0
+; CHECKNOP: test01.cl:{{[234]}}:3
+; CHECKNOP-NEXT: s_nop 0
; CHECK: test01.cl:5:{{[0-9]+}}
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: s_endpgm
; Function Attrs: nounwind
-define void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
+define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
entry:
%A.addr = alloca i32 addrspace(1)*, align 4
store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !17, metadata !18), !dbg !19
%0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !20
%arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 0, !dbg !20
- store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !21
+ store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !20
%1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !22
%arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 1, !dbg !22
store i32 2, i32 addrspace(1)* %arrayidx1, align 4, !dbg !23
diff --git a/test/CodeGen/AMDGPU/debugger-reserve-regs.ll b/test/CodeGen/AMDGPU/debugger-reserve-regs.ll
index d30bb20bb03a..764c60b12bf9 100644
--- a/test/CodeGen/AMDGPU/debugger-reserve-regs.ll
+++ b/test/CodeGen/AMDGPU/debugger-reserve-regs.ll
@@ -1,11 +1,12 @@
; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-reserve-regs -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=+amdgpu-debugger-reserve-regs -verify-machineinstrs < %s | FileCheck %s
; CHECK: reserved_vgpr_first = {{[0-9]+}}
; CHECK-NEXT: reserved_vgpr_count = 4
; CHECK: ReservedVGPRFirst: {{[0-9]+}}
; CHECK-NEXT: ReservedVGPRCount: 4
; Function Attrs: nounwind
-define void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
+define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
entry:
%A.addr = alloca i32 addrspace(1)*, align 4
store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
diff --git a/test/CodeGen/AMDGPU/default-fp-mode.ll b/test/CodeGen/AMDGPU/default-fp-mode.ll
index 28d065e3b32b..ad9111a28654 100644
--- a/test/CodeGen/AMDGPU/default-fp-mode.ll
+++ b/test/CodeGen/AMDGPU/default-fp-mode.ll
@@ -3,7 +3,7 @@
; GCN-LABEL: {{^}}test_default_si:
; GCN: FloatMode: 192
; GCN: IeeeMode: 1
-define void @test_default_si(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 {
+define amdgpu_kernel void @test_default_si(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 {
store float 0.0, float addrspace(1)* %out0
store double 0.0, double addrspace(1)* %out1
ret void
@@ -12,7 +12,7 @@ define void @test_default_si(float addrspace(1)* %out0, double addrspace(1)* %ou
; GCN-LABEL: {{^}}test_default_vi:
; GCN: FloatMode: 192
; GCN: IeeeMode: 1
-define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 {
+define amdgpu_kernel void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 {
store float 0.0, float addrspace(1)* %out0
store double 0.0, double addrspace(1)* %out1
ret void
@@ -21,7 +21,7 @@ define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %ou
; GCN-LABEL: {{^}}test_f64_denormals:
; GCN: FloatMode: 192
; GCN: IeeeMode: 1
-define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 {
+define amdgpu_kernel void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 {
store float 0.0, float addrspace(1)* %out0
store double 0.0, double addrspace(1)* %out1
ret void
@@ -30,7 +30,7 @@ define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)*
; GCN-LABEL: {{^}}test_f32_denormals:
; GCN: FloatMode: 48
; GCN: IeeeMode: 1
-define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 {
+define amdgpu_kernel void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 {
store float 0.0, float addrspace(1)* %out0
store double 0.0, double addrspace(1)* %out1
ret void
@@ -39,7 +39,7 @@ define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)*
; GCN-LABEL: {{^}}test_f32_f64_denormals:
; GCN: FloatMode: 240
; GCN: IeeeMode: 1
-define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 {
+define amdgpu_kernel void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 {
store float 0.0, float addrspace(1)* %out0
store double 0.0, double addrspace(1)* %out1
ret void
@@ -48,12 +48,40 @@ define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(
; GCN-LABEL: {{^}}test_no_denormals
; GCN: FloatMode: 0
; GCN: IeeeMode: 1
-define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 {
+define amdgpu_kernel void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 {
store float 0.0, float addrspace(1)* %out0
store double 0.0, double addrspace(1)* %out1
ret void
}
+; GCN-LABEL: {{^}}test_f16_f64_denormals:
+; GCN: FloatMode: 192
+; GCN: IeeeMode: 1
+define amdgpu_kernel void @test_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #6 {
+ store half 0.0, half addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_no_f16_f64_denormals:
+; GCN: FloatMode: 0
+; GCN: IeeeMode: 1
+define amdgpu_kernel void @test_no_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #7 {
+ store half 0.0, half addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_f32_f16_f64_denormals:
+; GCN: FloatMode: 240
+; GCN: IeeeMode: 1
+define amdgpu_kernel void @test_f32_f16_f64_denormals(half addrspace(1)* %out0, float addrspace(1)* %out1, double addrspace(1)* %out2) #8 {
+ store half 0.0, half addrspace(1)* %out0
+ store float 0.0, float addrspace(1)* %out1
+ store double 0.0, double addrspace(1)* %out2
+ ret void
+}
+
; GCN-LABEL: {{^}}kill_gs_const:
; GCN: IeeeMode: 0
define amdgpu_gs void @kill_gs_const() {
@@ -69,22 +97,22 @@ main_body:
; GCN-LABEL: {{^}}kill_vcc_implicit_def:
; GCN: IeeeMode: 0
-define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) {
+define amdgpu_ps float @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) {
entry:
%tmp0 = fcmp olt float %13, 0.0
call void @llvm.AMDGPU.kill(float %14)
%tmp1 = select i1 %tmp0, float 1.0, float 0.0
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 1, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1)
- ret void
+ ret float %tmp1
}
-
declare void @llvm.AMDGPU.kill(float)
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
attributes #0 = { nounwind "target-cpu"="tahiti" }
attributes #1 = { nounwind "target-cpu"="fiji" }
attributes #2 = { nounwind "target-features"="+fp64-denormals" }
attributes #3 = { nounwind "target-features"="+fp32-denormals" }
attributes #4 = { nounwind "target-features"="+fp32-denormals,+fp64-denormals" }
-attributes #5 = { nounwind "target-features"="-fp32-denormals,-fp64-denormals" }
+attributes #5 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" }
+attributes #6 = { nounwind "target-features"="+fp64-fp16-denormals" }
+attributes #7 = { nounwind "target-features"="-fp64-fp16-denormals" }
+attributes #8 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" }
diff --git a/test/CodeGen/AMDGPU/detect-dead-lanes.mir b/test/CodeGen/AMDGPU/detect-dead-lanes.mir
index 9d70f67ef491..32e6f7cc0cdc 100644
--- a/test/CodeGen/AMDGPU/detect-dead-lanes.mir
+++ b/test/CodeGen/AMDGPU/detect-dead-lanes.mir
@@ -1,14 +1,14 @@
# RUN: llc -march=amdgcn -run-pass detect-dead-lanes -o - %s | FileCheck %s
--- |
- define void @test0() { ret void }
- define void @test1() { ret void }
- define void @test2() { ret void }
- define void @test3() { ret void }
- define void @test4() { ret void }
- define void @test5() { ret void }
- define void @loop0() { ret void }
- define void @loop1() { ret void }
- define void @loop2() { ret void }
+ define amdgpu_kernel void @test0() { ret void }
+ define amdgpu_kernel void @test1() { ret void }
+ define amdgpu_kernel void @test2() { ret void }
+ define amdgpu_kernel void @test3() { ret void }
+ define amdgpu_kernel void @test4() { ret void }
+ define amdgpu_kernel void @test5() { ret void }
+ define amdgpu_kernel void @loop0() { ret void }
+ define amdgpu_kernel void @loop1() { ret void }
+ define amdgpu_kernel void @loop2() { ret void }
...
---
# Combined use/def transfer check, the basics.
diff --git a/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll b/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll
index cdd2c0cd4f43..6dfe1294bb47 100644
--- a/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll
+++ b/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll
@@ -9,7 +9,7 @@
; CHECK: ALU_PUSH_BEFORE
; CHECK-NEXT: JUMP
; CHECK-NEXT: LOOP_BREAK
-define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind {
+define amdgpu_kernel void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind {
entry:
%cmp5 = icmp sgt i32 %iterations, 0
br i1 %cmp5, label %for.body, label %for.end
diff --git a/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll b/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
index 5e1ebfde3e10..878b5ebe9409 100644
--- a/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
+++ b/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
@@ -9,7 +9,7 @@
; GCN: buffer_load_dword
; GCN: ds_write2_b32
; GCN: s_endpgm
-define void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 {
+define amdgpu_kernel void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 {
entry:
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx = shl i32 %tid, 2
diff --git a/test/CodeGen/AMDGPU/ds-combine-large-stride.ll b/test/CodeGen/AMDGPU/ds-combine-large-stride.ll
new file mode 100644
index 000000000000..a723b0210ade
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ds-combine-large-stride.ll
@@ -0,0 +1,412 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+
+; GCN-LABEL: ds_read32_combine_stride_400:
+; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
+; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
+; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
+; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
+; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
+; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100
+; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100
+; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100
+; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100
+define amdgpu_kernel void @ds_read32_combine_stride_400(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
+bb:
+ %tmp = load float, float addrspace(3)* %arg, align 4
+ %tmp2 = fadd float %tmp, 0.000000e+00
+ %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
+ %tmp4 = load float, float addrspace(3)* %tmp3, align 4
+ %tmp5 = fadd float %tmp2, %tmp4
+ %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
+ %tmp7 = load float, float addrspace(3)* %tmp6, align 4
+ %tmp8 = fadd float %tmp5, %tmp7
+ %tmp9 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
+ %tmp10 = load float, float addrspace(3)* %tmp9, align 4
+ %tmp11 = fadd float %tmp8, %tmp10
+ %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
+ %tmp13 = load float, float addrspace(3)* %tmp12, align 4
+ %tmp14 = fadd float %tmp11, %tmp13
+ %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
+ %tmp16 = load float, float addrspace(3)* %tmp15, align 4
+ %tmp17 = fadd float %tmp14, %tmp16
+ %tmp18 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
+ %tmp19 = load float, float addrspace(3)* %tmp18, align 4
+ %tmp20 = fadd float %tmp17, %tmp19
+ %tmp21 = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
+ %tmp22 = load float, float addrspace(3)* %tmp21, align 4
+ %tmp23 = fadd float %tmp20, %tmp22
+ store float %tmp23, float *%arg1, align 4
+ ret void
+}
+
+; GCN-LABEL: ds_read32_combine_stride_400_back:
+; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
+; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
+; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
+; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
+; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
+; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100
+; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100
+; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100
+; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100
+define amdgpu_kernel void @ds_read32_combine_stride_400_back(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
+bb:
+ %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
+ %tmp2 = load float, float addrspace(3)* %tmp, align 4
+ %tmp3 = fadd float %tmp2, 0.000000e+00
+ %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
+ %tmp5 = load float, float addrspace(3)* %tmp4, align 4
+ %tmp6 = fadd float %tmp3, %tmp5
+ %tmp7 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
+ %tmp8 = load float, float addrspace(3)* %tmp7, align 4
+ %tmp9 = fadd float %tmp6, %tmp8
+ %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
+ %tmp11 = load float, float addrspace(3)* %tmp10, align 4
+ %tmp12 = fadd float %tmp9, %tmp11
+ %tmp13 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
+ %tmp14 = load float, float addrspace(3)* %tmp13, align 4
+ %tmp15 = fadd float %tmp12, %tmp14
+ %tmp16 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
+ %tmp17 = load float, float addrspace(3)* %tmp16, align 4
+ %tmp18 = fadd float %tmp15, %tmp17
+ %tmp19 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
+ %tmp20 = load float, float addrspace(3)* %tmp19, align 4
+ %tmp21 = fadd float %tmp18, %tmp20
+ %tmp22 = load float, float addrspace(3)* %arg, align 4
+ %tmp23 = fadd float %tmp21, %tmp22
+ store float %tmp23, float *%arg1, align 4
+ ret void
+}
+
+; GCN-LABEL: ds_read32_combine_stride_8192:
+; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
+; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
+; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:32
+; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:64 offset1:96
+; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:128 offset1:160
+; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:192 offset1:224
+define amdgpu_kernel void @ds_read32_combine_stride_8192(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
+bb:
+ %tmp = load float, float addrspace(3)* %arg, align 4
+ %tmp2 = fadd float %tmp, 0.000000e+00
+ %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2048
+ %tmp4 = load float, float addrspace(3)* %tmp3, align 4
+ %tmp5 = fadd float %tmp2, %tmp4
+ %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4096
+ %tmp7 = load float, float addrspace(3)* %tmp6, align 4
+ %tmp8 = fadd float %tmp5, %tmp7
+ %tmp9 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6144
+ %tmp10 = load float, float addrspace(3)* %tmp9, align 4
+ %tmp11 = fadd float %tmp8, %tmp10
+ %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8192
+ %tmp13 = load float, float addrspace(3)* %tmp12, align 4
+ %tmp14 = fadd float %tmp11, %tmp13
+ %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10240
+ %tmp16 = load float, float addrspace(3)* %tmp15, align 4
+ %tmp17 = fadd float %tmp14, %tmp16
+ %tmp18 = getelementptr inbounds float, float addrspace(3)* %arg, i32 12288
+ %tmp19 = load float, float addrspace(3)* %tmp18, align 4
+ %tmp20 = fadd float %tmp17, %tmp19
+ %tmp21 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14336
+ %tmp22 = load float, float addrspace(3)* %tmp21, align 4
+ %tmp23 = fadd float %tmp20, %tmp22
+ store float %tmp23, float *%arg1, align 4
+ ret void
+}
+
+; GCN-LABEL: ds_read32_combine_stride_8192_shifted:
+; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
+; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
+; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
+; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]]
+; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]]
+; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:32
+; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:32
+; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:32
+define amdgpu_kernel void @ds_read32_combine_stride_8192_shifted(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
+bb:
+ %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 2
+ %tmp2 = load float, float addrspace(3)* %tmp, align 4
+ %tmp3 = fadd float %tmp2, 0.000000e+00
+ %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2050
+ %tmp5 = load float, float addrspace(3)* %tmp4, align 4
+ %tmp6 = fadd float %tmp3, %tmp5
+ %tmp7 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4098
+ %tmp8 = load float, float addrspace(3)* %tmp7, align 4
+ %tmp9 = fadd float %tmp6, %tmp8
+ %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6146
+ %tmp11 = load float, float addrspace(3)* %tmp10, align 4
+ %tmp12 = fadd float %tmp9, %tmp11
+ %tmp13 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8194
+ %tmp14 = load float, float addrspace(3)* %tmp13, align 4
+ %tmp15 = fadd float %tmp12, %tmp14
+ %tmp16 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10242
+ %tmp17 = load float, float addrspace(3)* %tmp16, align 4
+ %tmp18 = fadd float %tmp15, %tmp17
+ store float %tmp18, float *%arg1, align 4
+ ret void
+}
+
+; GCN-LABEL: ds_read64_combine_stride_400:
+; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
+; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
+; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]]
+; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:50
+; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:100 offset1:150
+; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:200 offset1:250
+; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:50
+define amdgpu_kernel void @ds_read64_combine_stride_400(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) {
+bb:
+ %tmp = load double, double addrspace(3)* %arg, align 8
+ %tmp2 = fadd double %tmp, 0.000000e+00
+ %tmp3 = getelementptr inbounds double, double addrspace(3)* %arg, i32 50
+ %tmp4 = load double, double addrspace(3)* %tmp3, align 8
+ %tmp5 = fadd double %tmp2, %tmp4
+ %tmp6 = getelementptr inbounds double, double addrspace(3)* %arg, i32 100
+ %tmp7 = load double, double addrspace(3)* %tmp6, align 8
+ %tmp8 = fadd double %tmp5, %tmp7
+ %tmp9 = getelementptr inbounds double, double addrspace(3)* %arg, i32 150
+ %tmp10 = load double, double addrspace(3)* %tmp9, align 8
+ %tmp11 = fadd double %tmp8, %tmp10
+ %tmp12 = getelementptr inbounds double, double addrspace(3)* %arg, i32 200
+ %tmp13 = load double, double addrspace(3)* %tmp12, align 8
+ %tmp14 = fadd double %tmp11, %tmp13
+ %tmp15 = getelementptr inbounds double, double addrspace(3)* %arg, i32 250
+ %tmp16 = load double, double addrspace(3)* %tmp15, align 8
+ %tmp17 = fadd double %tmp14, %tmp16
+ %tmp18 = getelementptr inbounds double, double addrspace(3)* %arg, i32 300
+ %tmp19 = load double, double addrspace(3)* %tmp18, align 8
+ %tmp20 = fadd double %tmp17, %tmp19
+ %tmp21 = getelementptr inbounds double, double addrspace(3)* %arg, i32 350
+ %tmp22 = load double, double addrspace(3)* %tmp21, align 8
+ %tmp23 = fadd double %tmp20, %tmp22
+ store double %tmp23, double *%arg1, align 8
+ ret void
+}
+
+; GCN-LABEL: ds_read64_combine_stride_8192_shifted:
+; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
+; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
+; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
+; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]]
+; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]]
+; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:16
+; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:16
+; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:16
+define amdgpu_kernel void @ds_read64_combine_stride_8192_shifted(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) {
+bb:
+ %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 1
+ %tmp2 = load double, double addrspace(3)* %tmp, align 8
+ %tmp3 = fadd double %tmp2, 0.000000e+00
+ %tmp4 = getelementptr inbounds double, double addrspace(3)* %arg, i32 1025
+ %tmp5 = load double, double addrspace(3)* %tmp4, align 8
+ %tmp6 = fadd double %tmp3, %tmp5
+ %tmp7 = getelementptr inbounds double, double addrspace(3)* %arg, i32 2049
+ %tmp8 = load double, double addrspace(3)* %tmp7, align 8
+ %tmp9 = fadd double %tmp6, %tmp8
+ %tmp10 = getelementptr inbounds double, double addrspace(3)* %arg, i32 3073
+ %tmp11 = load double, double addrspace(3)* %tmp10, align 8
+ %tmp12 = fadd double %tmp9, %tmp11
+ %tmp13 = getelementptr inbounds double, double addrspace(3)* %arg, i32 4097
+ %tmp14 = load double, double addrspace(3)* %tmp13, align 8
+ %tmp15 = fadd double %tmp12, %tmp14
+ %tmp16 = getelementptr inbounds double, double addrspace(3)* %arg, i32 5121
+ %tmp17 = load double, double addrspace(3)* %tmp16, align 8
+ %tmp18 = fadd double %tmp15, %tmp17
+ store double %tmp18, double *%arg1, align 8
+ ret void
+}
+
+; GCN-LABEL: ds_write32_combine_stride_400:
+; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
+; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
+; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
+; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
+; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
+; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
+; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
+; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
+; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
+define amdgpu_kernel void @ds_write32_combine_stride_400(float addrspace(3)* nocapture %arg) {
+bb:
+ store float 1.000000e+00, float addrspace(3)* %arg, align 4
+ %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
+ store float 1.000000e+00, float addrspace(3)* %tmp, align 4
+ %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
+ store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
+ %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
+ store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
+ %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
+ store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
+ %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
+ store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
+ %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
+ store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
+ %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
+ store float 1.000000e+00, float addrspace(3)* %tmp6, align 4
+ ret void
+}
+
+; GCN-LABEL: ds_write32_combine_stride_400_back:
+; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
+; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
+; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
+; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
+; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
+; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
+; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
+; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
+; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
+define amdgpu_kernel void @ds_write32_combine_stride_400_back(float addrspace(3)* nocapture %arg) {
+bb:
+ %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
+ store float 1.000000e+00, float addrspace(3)* %tmp, align 4
+ %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
+ store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
+ %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
+ store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
+ %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
+ store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
+ %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
+ store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
+ %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
+ store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
+ %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
+ store float 1.000000e+00, float addrspace(3)* %tmp6, align 4
+ store float 1.000000e+00, float addrspace(3)* %arg, align 4
+ ret void
+}
+
+; GCN-LABEL: ds_write32_combine_stride_8192:
+; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
+; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
+; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
+; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:64 offset1:96
+; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:128 offset1:160
+; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:192 offset1:224
+define amdgpu_kernel void @ds_write32_combine_stride_8192(float addrspace(3)* nocapture %arg) {
+bb:
+ store float 1.000000e+00, float addrspace(3)* %arg, align 4
+ %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 2048
+ store float 1.000000e+00, float addrspace(3)* %tmp, align 4
+ %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4096
+ store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
+ %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6144
+ store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
+ %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8192
+ store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
+ %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10240
+ store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
+ %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 12288
+ store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
+ %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14336
+ store float 1.000000e+00, float addrspace(3)* %tmp6, align 4
+ ret void
+}
+
+; GCN-LABEL: ds_write32_combine_stride_8192_shifted:
+; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
+; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
+; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]]
+; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4004, [[BASE]]
+; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8004, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4004, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8004, [[BASE]]
+; GCN-DAG: ds_write2st64_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
+; GCN-DAG: ds_write2st64_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
+; GCN-DAG: ds_write2st64_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
+define amdgpu_kernel void @ds_write32_combine_stride_8192_shifted(float addrspace(3)* nocapture %arg) {
+bb:
+ %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 1
+ store float 1.000000e+00, float addrspace(3)* %tmp, align 4
+ %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2049
+ store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
+ %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4097
+ store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
+ %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6145
+ store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
+ %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8193
+ store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
+ %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10241
+ store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
+ ret void
+}
+
+; GCN-LABEL: ds_write64_combine_stride_400:
+; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
+; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
+; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]]
+; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50
+; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:100 offset1:150
+; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:200 offset1:250
+; GCN-DAG: ds_write2_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50
+define amdgpu_kernel void @ds_write64_combine_stride_400(double addrspace(3)* nocapture %arg) {
+bb:
+ store double 1.000000e+00, double addrspace(3)* %arg, align 8
+ %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 50
+ store double 1.000000e+00, double addrspace(3)* %tmp, align 8
+ %tmp1 = getelementptr inbounds double, double addrspace(3)* %arg, i32 100
+ store double 1.000000e+00, double addrspace(3)* %tmp1, align 8
+ %tmp2 = getelementptr inbounds double, double addrspace(3)* %arg, i32 150
+ store double 1.000000e+00, double addrspace(3)* %tmp2, align 8
+ %tmp3 = getelementptr inbounds double, double addrspace(3)* %arg, i32 200
+ store double 1.000000e+00, double addrspace(3)* %tmp3, align 8
+ %tmp4 = getelementptr inbounds double, double addrspace(3)* %arg, i32 250
+ store double 1.000000e+00, double addrspace(3)* %tmp4, align 8
+ %tmp5 = getelementptr inbounds double, double addrspace(3)* %arg, i32 300
+ store double 1.000000e+00, double addrspace(3)* %tmp5, align 8
+ %tmp6 = getelementptr inbounds double, double addrspace(3)* %arg, i32 350
+ store double 1.000000e+00, double addrspace(3)* %tmp6, align 8
+ ret void
+}
+
+; GCN-LABEL: ds_write64_combine_stride_8192_shifted:
+; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
+; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
+; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
+; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]]
+; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]]
+; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]]
+; GCN-DAG: ds_write2st64_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
+; GCN-DAG: ds_write2st64_b64 [[B2]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
+; GCN-DAG: ds_write2st64_b64 [[B3]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
+define amdgpu_kernel void @ds_write64_combine_stride_8192_shifted(double addrspace(3)* nocapture %arg) {
+bb:
+ %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 1
+ store double 1.000000e+00, double addrspace(3)* %tmp, align 8
+ %tmp1 = getelementptr inbounds double, double addrspace(3)* %arg, i32 1025
+ store double 1.000000e+00, double addrspace(3)* %tmp1, align 8
+ %tmp2 = getelementptr inbounds double, double addrspace(3)* %arg, i32 2049
+ store double 1.000000e+00, double addrspace(3)* %tmp2, align 8
+ %tmp3 = getelementptr inbounds double, double addrspace(3)* %arg, i32 3073
+ store double 1.000000e+00, double addrspace(3)* %tmp3, align 8
+ %tmp4 = getelementptr inbounds double, double addrspace(3)* %arg, i32 4097
+ store double 1.000000e+00, double addrspace(3)* %tmp4, align 8
+ %tmp5 = getelementptr inbounds double, double addrspace(3)* %arg, i32 5121
+ store double 1.000000e+00, double addrspace(3)* %tmp5, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
index f461d6978f13..5997e27fd815 100644
--- a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
+++ b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
@@ -23,7 +23,7 @@ declare void @llvm.amdgcn.s.barrier() #1
; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:34
; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256
; CHECK: s_endpgm
-define void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 {
+define amdgpu_kernel void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 {
entry:
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%mul = shl nsw i32 %x.i, 1
diff --git a/test/CodeGen/AMDGPU/ds-sub-offset.ll b/test/CodeGen/AMDGPU/ds-sub-offset.ll
index 16fb019ae0f3..d74bd5aa15ac 100644
--- a/test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ b/test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
; GCN: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, 0, [[SHL]]
; GCN: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b
; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12
-define void @write_ds_sub0_offset0_global() #0 {
+define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
entry:
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
%sub1 = sub i32 0, %x.i
@@ -24,7 +24,7 @@ entry:
; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]]
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13
; GCN: ds_write_b8 [[NEG]], [[K]] offset:65535
-define void @add_x_shl_neg_to_sub_max_offset() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset() #1 {
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
@@ -39,7 +39,7 @@ define void @add_x_shl_neg_to_sub_max_offset() #1 {
; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x10000, [[SCALED]]
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13
; GCN: ds_write_b8 [[NEG]], [[K]]{{$}}
-define void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
@@ -58,7 +58,7 @@ define void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
; GCN-NOT: v_sub
; GCN: ds_write_b32 [[NEG]], [[K]] offset:456{{$}}
; GCN: s_endpgm
-define void @add_x_shl_neg_to_sub_multi_use() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 {
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
@@ -80,7 +80,7 @@ define void @add_x_shl_neg_to_sub_multi_use() #1 {
; GCN-NOT: v_sub
; GCN: ds_write_b32 [[NEG]], [[K]] offset:123{{$}}
; GCN: s_endpgm
-define void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
@@ -95,7 +95,7 @@ define void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0
; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]]
; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset0:254 offset1:255
-define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
@@ -109,7 +109,7 @@ define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0
; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x3fc, [[SCALED]]
; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}}
-define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 {
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
diff --git a/test/CodeGen/AMDGPU/ds_read2.ll b/test/CodeGen/AMDGPU/ds_read2.ll
index 9a313230e303..2c474dbe7b08 100644
--- a/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/test/CodeGen/AMDGPU/ds_read2.ll
@@ -12,7 +12,7 @@
; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @simple_read2_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -31,7 +31,7 @@ define void @simple_read2_f32(float addrspace(1)* %out) #0 {
; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -49,7 +49,7 @@ define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
; SI: s_endpgm
-define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -66,7 +66,7 @@ define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
; SI: s_endpgm
-define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 0
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
@@ -98,7 +98,7 @@ define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
; SI: s_barrier
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
; SI: s_endpgm
-define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 0
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
@@ -133,7 +133,7 @@ define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
; SI: s_endpgm
-define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
@@ -170,7 +170,7 @@ define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
; SI: ds_read_b32
; SI: ds_read_b32
; SI: s_endpgm
-define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
+define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
%index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
@@ -196,7 +196,7 @@ define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float ad
; SI: ds_read_b32
; SI: ds_read_b32
; SI: s_endpgm
-define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
+define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
%index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
@@ -219,7 +219,7 @@ define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x f
; SI-LABEL: {{^}}read2_ptr_is_subreg_f32:
; SI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}}
; SI: s_endpgm
-define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0
%ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1
@@ -243,7 +243,7 @@ define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
; SI: s_endpgm
-define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4
@@ -261,7 +261,7 @@ define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
; SI: s_endpgm
-define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -280,7 +280,7 @@ define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
; SI-LABEL: @unaligned_read2_f32
; SI-NOT: ds_read2_b32
; SI: s_endpgm
-define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 1
@@ -296,7 +296,7 @@ define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %
; SI-LABEL: @misaligned_2_simple_read2_f32
; SI-NOT: ds_read2_b32
; SI: s_endpgm
-define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 2
@@ -315,7 +315,7 @@ define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrs
; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
; SI: buffer_store_dwordx2 [[RESULT]]
; SI: s_endpgm
-define void @simple_read2_f64(double addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -331,7 +331,7 @@ define void @simple_read2_f64(double addrspace(1)* %out) #0 {
; SI-LABEL: @simple_read2_f64_max_offset
; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255
; SI: s_endpgm
-define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -349,7 +349,7 @@ define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056
; SI: s_endpgm
-define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -367,7 +367,7 @@ define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15
; SI: s_endpgm
-define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
%val0 = load double, double addrspace(3)* %arrayidx0, align 4
@@ -385,7 +385,7 @@ define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)
; SI-LABEL: @load_constant_adjacent_offsets
; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
-define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
%val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
%val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
%sum = add i32 %val0, %val1
@@ -396,7 +396,7 @@ define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
; SI-LABEL: @load_constant_disjoint_offsets
; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2
-define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
%val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
%val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
%sum = add i32 %val0, %val1
@@ -410,7 +410,7 @@ define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
-define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
%sum = add i64 %val0, %val1
@@ -426,7 +426,7 @@ define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1
; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1
; SI: s_endpgm
-define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
%sum = add i64 %val0, %val1
@@ -437,7 +437,7 @@ define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
-define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 {
+define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 {
%x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
%y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
%arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
@@ -481,13 +481,13 @@ define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i
ret void
}
-define void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 {
%load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4
store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8
ret void
}
-define void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 {
%load = load i64, i64 addrspace(3)* %in, align 4
store i64 %load, i64 addrspace(1)* %out, align 8
ret void
diff --git a/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
index 4a3f3fb99700..9668743cf128 100644
--- a/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
+++ b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
@@ -10,7 +10,7 @@
; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56
; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:12
-define void @offset_order(float addrspace(1)* %out) {
+define amdgpu_kernel void @offset_order(float addrspace(1)* %out) {
entry:
%ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 0
%val0 = load float, float addrspace(3)* %ptr0
diff --git a/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/test/CodeGen/AMDGPU/ds_read2_superreg.ll
index 9d8375d64037..3dfdaf3936a6 100644
--- a/test/CodeGen/AMDGPU/ds_read2_superreg.ll
+++ b/test/CodeGen/AMDGPU/ds_read2_superreg.ll
@@ -12,7 +12,7 @@
; CI: s_waitcnt lgkmcnt(0)
; CI: buffer_store_dwordx2 [[RESULT]]
; CI: s_endpgm
-define void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i
%val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0, align 4
@@ -26,7 +26,7 @@ define void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out)
; CI: s_waitcnt lgkmcnt(0)
; CI: buffer_store_dwordx2 [[RESULT]]
; CI: s_endpgm
-define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i
%val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0
@@ -43,7 +43,7 @@ define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 {
; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]]
; CI: buffer_store_dword v[[ADD2]]
; CI: s_endpgm
-define void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
%val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 4
@@ -68,7 +68,7 @@ define void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]]
; CI: buffer_store_dword v[[ADD1]]
; CI: s_endpgm
-define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <3 x float>], [512 x <3 x float>] addrspace(3)* @lds.v3, i32 0, i32 %x.i
%val0 = load <3 x float>, <3 x float> addrspace(3)* %arrayidx0, align 4
@@ -88,7 +88,7 @@ define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
; CI: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
; CI: buffer_store_dwordx4 [[REG_ZW]]
; CI: s_endpgm
-define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
%val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 8
@@ -101,7 +101,7 @@ define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out)
; CI-DAG: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
; CI: buffer_store_dwordx4 [[REG_ZW]]
; CI: s_endpgm
-define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
%val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0
@@ -117,7 +117,7 @@ define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
; CI-DAG: buffer_store_dwordx4 [[VEC_HI]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
; CI-DAG: buffer_store_dwordx4 [[VEC_LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64{{$}}
; CI: s_endpgm
-define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <8 x float>], [512 x <8 x float>] addrspace(3)* @lds.v8, i32 0, i32 %x.i
%val0 = load <8 x float>, <8 x float> addrspace(3)* %arrayidx0
@@ -138,7 +138,7 @@ define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
; CI-DAG: buffer_store_dwordx4 [[VEC8_11]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32
; CI-DAG: buffer_store_dwordx4 [[VEC12_15]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48
; CI: s_endpgm
-define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <16 x float>], [512 x <16 x float>] addrspace(3)* @lds.v16, i32 0, i32 %x.i
%val0 = load <16 x float>, <16 x float> addrspace(3)* %arrayidx0
@@ -150,10 +150,10 @@ define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
; Do scalar loads into the super register we need.
; CI-LABEL: {{^}}simple_read2_v2f32_superreg_scalar_loads_align4:
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
-; CI-NOT: v_mov
+; CI-NOT: v_mov {{v[0-9]+}}, {{[sv][0-9]+}}
; CI: buffer_store_dwordx2 v{{\[}}[[REG_ELT0]]:[[REG_ELT1]]{{\]}}
; CI: s_endpgm
-define void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1
@@ -173,10 +173,10 @@ define void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspa
; CI-LABEL: {{^}}simple_read2_v4f32_superreg_scalar_loads_align4:
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT2:[0-9]+]]:[[REG_ELT3:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
-; CI-NOT: v_mov
+; CI-NOT: v_mov {{v[0-9]+}}, {{[sv][0-9]+}}
; CI: buffer_store_dwordx4 v{{\[}}[[REG_ELT0]]:[[REG_ELT3]]{{\]}}
; CI: s_endpgm
-define void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1
diff --git a/test/CodeGen/AMDGPU/ds_read2st64.ll b/test/CodeGen/AMDGPU/ds_read2st64.ll
index 99f01b4f2622..81b35a46aa18 100644
--- a/test/CodeGen/AMDGPU/ds_read2st64.ll
+++ b/test/CodeGen/AMDGPU/ds_read2st64.ll
@@ -10,7 +10,7 @@
; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -29,7 +29,7 @@ define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
%arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
@@ -49,7 +49,7 @@ define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(
; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
%arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
@@ -69,7 +69,7 @@ define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float add
; SI-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
; SI-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}}
; SI: s_endpgm
-define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
%arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
@@ -86,7 +86,7 @@ define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, floa
; SI-LABEL: @odd_invalid_read2st64_f32_0
; SI-NOT: ds_read2st64_b32
; SI: s_endpgm
-define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -102,7 +102,7 @@ define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
; SI-LABEL: @odd_invalid_read2st64_f32_1
; SI-NOT: ds_read2st64_b32
; SI: s_endpgm
-define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
@@ -122,7 +122,7 @@ define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
; SI: buffer_store_dwordx2 [[RESULT]]
; SI: s_endpgm
-define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -141,7 +141,7 @@ define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
; SI: buffer_store_dwordx2 [[RESULT]]
; SI: s_endpgm
-define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
@@ -161,7 +161,7 @@ define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspac
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
; SI: s_endpgm
-define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
%val0 = load double, double addrspace(3)* %arrayidx0, align 4
@@ -181,7 +181,7 @@ define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspac
; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
; SI: buffer_store_dwordx2 [[RESULT]]
; SI: s_endpgm
-define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 256
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
@@ -197,11 +197,11 @@ define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double a
; SI-LABEL: @simple_read2st64_f64_over_max_offset
; SI-NOT: ds_read2st64_b64
-; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
-; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
+; SI-DAG: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
+; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
; SI: s_endpgm
-define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
@@ -218,7 +218,7 @@ define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, dou
; SI-LABEL: @invalid_read2st64_f64_odd_offset
; SI-NOT: ds_read2st64_b64
; SI: s_endpgm
-define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
@@ -239,7 +239,7 @@ define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double
; SI-NOT: ds_read2st_b64
; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8
; SI: s_endpgm
-define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
diff --git a/test/CodeGen/AMDGPU/ds_write2.ll b/test/CodeGen/AMDGPU/ds_write2.ll
index ae230dac9378..ab1cf0ba25b5 100644
--- a/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/test/CodeGen/AMDGPU/ds_write2.ll
@@ -9,7 +9,7 @@
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8
; SI: s_endpgm
-define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
%val = load float, float addrspace(1)* %in.gep, align 4
@@ -27,7 +27,7 @@ define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
; SI: s_endpgm
-define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
%in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
@@ -46,7 +46,7 @@ define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1
; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
; SI: s_endpgm
-define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
%in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
@@ -65,7 +65,7 @@ define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float
; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
; SI: s_endpgm
-define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
%in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
@@ -86,7 +86,7 @@ define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float
; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
; SI: s_endpgm
-define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
%in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
@@ -107,7 +107,7 @@ define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
; SI: s_endpgm
-define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
%val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
@@ -126,7 +126,7 @@ define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x floa
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
; SI: s_endpgm
-define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
%val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
@@ -146,7 +146,7 @@ define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x floa
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
; SI: s_endpgm
-define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
%in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
@@ -164,7 +164,7 @@ define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float
; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
; SI: s_endpgm
-define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
%in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
@@ -182,7 +182,7 @@ define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float add
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
; SI: s_endpgm
-define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
%in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
@@ -212,7 +212,7 @@ define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspac
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
; SI: s_endpgm
-define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
%in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
@@ -243,7 +243,7 @@ define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, f
; SI: ds_write_b32
; SI: ds_write_b32
; SI: s_endpgm
-define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
+define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
%in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
@@ -270,7 +270,7 @@ define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float add
; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
; SI: s_endpgm
-define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
%val = load double, double addrspace(1)* %in.gep, align 8
@@ -288,7 +288,7 @@ define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15
; SI: s_endpgm
-define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
%val = load double, double addrspace(1)* %in.gep, align 8
@@ -306,7 +306,7 @@ define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, doubl
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
; SI: s_endpgm
-define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
%in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
@@ -325,7 +325,7 @@ define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace
; SI-LABEL: @store_constant_adjacent_offsets
; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
-define void @store_constant_adjacent_offsets() {
+define amdgpu_kernel void @store_constant_adjacent_offsets() {
store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
ret void
@@ -335,7 +335,7 @@ define void @store_constant_adjacent_offsets() {
; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2
-define void @store_constant_disjoint_offsets() {
+define amdgpu_kernel void @store_constant_disjoint_offsets() {
store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
ret void
@@ -348,7 +348,7 @@ define void @store_constant_disjoint_offsets() {
; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
; SI: s_endpgm
-define void @store_misaligned64_constant_offsets() {
+define amdgpu_kernel void @store_misaligned64_constant_offsets() {
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
ret void
@@ -362,7 +362,7 @@ define void @store_misaligned64_constant_offsets() {
; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
; SI: s_endpgm
-define void @store_misaligned64_constant_large_offsets() {
+define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
ret void
@@ -371,7 +371,7 @@ define void @store_misaligned64_constant_large_offsets() {
@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
-define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
%y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
%val = load float, float addrspace(1)* %in
@@ -410,7 +410,7 @@ define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, f
; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:2{{$}}
; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:1{{$}}
; CI: s_endpgm
-define void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
%val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4
diff --git a/test/CodeGen/AMDGPU/ds_write2st64.ll b/test/CodeGen/AMDGPU/ds_write2st64.ll
index 872e77361406..a395af34b67b 100644
--- a/test/CodeGen/AMDGPU/ds_write2st64.ll
+++ b/test/CodeGen/AMDGPU/ds_write2st64.ll
@@ -7,7 +7,7 @@
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1
; SI: s_endpgm
-define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
%val = load float, float addrspace(1)* %in.gep, align 4
@@ -25,7 +25,7 @@ define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float add
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5
; SI: s_endpgm
-define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
%in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
@@ -46,7 +46,7 @@ define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float add
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
; SI: s_endpgm
-define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
%in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
@@ -66,7 +66,7 @@ define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, fl
; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]],
; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127
; SI: s_endpgm
-define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
%in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
@@ -85,7 +85,7 @@ define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, d
; SI-NOT: ds_write2st64_b64
; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8
; SI: s_endpgm
-define void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
%val = load double, double addrspace(1)* %in.gep, align 8
diff --git a/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index 580dc00f935e..b1107ea7fbcb 100644
--- a/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -4,7 +4,7 @@
; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-define void @test_dynamic_stackalloc(i32 addrspace(1)* %out, i32 %n) {
+define amdgpu_kernel void @test_dynamic_stackalloc(i32 addrspace(1)* %out, i32 %n) {
%alloca = alloca i32, i32 %n
store volatile i32 0, i32* %alloca
ret void
diff --git a/test/CodeGen/AMDGPU/early-if-convert-cost.ll b/test/CodeGen/AMDGPU/early-if-convert-cost.ll
new file mode 100644
index 000000000000..ace01593808b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/early-if-convert-cost.ll
@@ -0,0 +1,110 @@
+; RUN: llc -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; FIXME: Most of these cases don't trigger because of broken cost
+; heuristics. They should not need -stress-early-ifcvt.
+
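+; These triangle tests use an if-then CFG: entry conditionally branches
+; around a single block that falls through to the join, and if-conversion
+; replaces the branch with v_cndmask selects.
+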
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle64:
+; GCN: buffer_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; GCN: v_cmp_neq_f64_e32 vcc, 1.0, v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
+; GCN: v_add_f64 v{{\[}}[[ADD_LO:[0-9]+]]:[[ADD_HI:[0-9]+]]{{\]}}, v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}, v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
+; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_LO:[0-9]+]], v[[ADD_LO]], v[[VAL_LO]], vcc
+; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_HI:[0-9]+]], v[[ADD_HI]], v[[VAL_HI]], vcc
+; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+entry:
+ %v = load double, double addrspace(1)* %in
+ %cc = fcmp oeq double %v, 1.000000e+00
+ br i1 %cc, label %if, label %endif
+
+if:
+ %u = fadd double %v, %v
+ br label %endif
+
+endif:
+ %r = phi double [ %v, %entry ], [ %u, %if ]
+ store double %r, double addrspace(1)* %out
+ ret void
+}
+
+; vcc branch with SGPR inputs
+; GCN-LABEL: {{^}}test_vccnz_sgpr_ifcvt_triangle64:
+; GCN: v_cmp_neq_f64
+; GCN: v_add_f64
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(2)* %in) #0 {
+entry:
+ %v = load double, double addrspace(2)* %in
+ %cc = fcmp oeq double %v, 1.000000e+00
+ br i1 %cc, label %if, label %endif
+
+if:
+ %u = fadd double %v, %v
+ br label %endif
+
+endif:
+ %r = phi double [ %v, %entry ], [ %u, %if ]
+ store double %r, double addrspace(1)* %out
+ ret void
+}
+
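+; v_add_i32 writes a carry-out to vcc, so the f32 compare result is kept in
+; an SGPR pair and only moved into vcc right before the selects.
+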
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle96:
+; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
+
+; GCN: v_add_i32_e32
+; GCN: v_add_i32_e32
+; GCN: v_add_i32_e32
+; GCN: s_mov_b64 vcc, [[CMP]]
+
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+
+; GCN-DAG: buffer_store_dword v
+; GCN-DAG: buffer_store_dwordx2
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 {
+entry:
+ %v = load <3 x i32>, <3 x i32> addrspace(1)* %in
+ %cc = fcmp oeq float %cnd, 1.000000e+00
+ br i1 %cc, label %if, label %endif
+
+if:
+ %u = add <3 x i32> %v, %v
+ br label %endif
+
+endif:
+ %r = phi <3 x i32> [ %v, %entry ], [ %u, %if ]
+ store <3 x i32> %r, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle128:
+; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
+
+; GCN: v_add_i32_e32
+; GCN: v_add_i32_e32
+; GCN: v_add_i32_e32
+; GCN: v_add_i32_e32
+; GCN: s_mov_b64 vcc, [[CMP]]
+
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+
+; GCN: buffer_store_dwordx4
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, float %cnd) #0 {
+entry:
+ %v = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %cc = fcmp oeq float %cnd, 1.000000e+00
+ br i1 %cc, label %if, label %endif
+
+if:
+ %u = add <4 x i32> %v, %v
+ br label %endif
+
+endif:
+ %r = phi <4 x i32> [ %v, %entry ], [ %u, %if ]
+ store <4 x i32> %r, <4 x i32> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/early-if-convert.ll b/test/CodeGen/AMDGPU/early-if-convert.ll
new file mode 100644
index 000000000000..9439130deb9e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/early-if-convert.ll
@@ -0,0 +1,454 @@
+; RUN: llc -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; FIXME: This leaves behind a now-unnecessary "and" with exec.
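+; (the "and" masks the branch condition with exec and becomes dead once the
+; branch is converted away)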
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
+; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+ %v = load float, float addrspace(1)* %in
+ %cc = fcmp oeq float %v, 1.000000e+00
+ br i1 %cc, label %if, label %endif
+
+if:
+ %u = fadd float %v, %v
+ br label %endif
+
+endif:
+ %r = phi float [ %v, %entry ], [ %u, %if ]
+ store float %r, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_diamond:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
+; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
+; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]]
+; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[MUL]], vcc
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+ %v = load float, float addrspace(1)* %in
+ %cc = fcmp oeq float %v, 1.000000e+00
+ br i1 %cc, label %if, label %else
+
+if:
+ %u0 = fadd float %v, %v
+ br label %endif
+
+else:
+ %u1 = fmul float %v, %v
+ br label %endif
+
+endif:
+ %r = phi float [ %u0, %if ], [ %u1, %else ]
+ store float %r, float addrspace(1)* %out
+ ret void
+}
+
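+; If the converted block clobbers vcc, the compare result is kept in an SGPR
+; pair and moved back into vcc with s_mov_b64 before the cndmask.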
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_vcc_clobber:
+; GCN: ; clobber vcc
+; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
+; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc
+; GCN: s_mov_b64 vcc, [[CMP]]
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 {
+entry:
+ %v = load i32, i32 addrspace(1)* %in
+ %cc = fcmp oeq float %k, 1.000000e+00
+ br i1 %cc, label %if, label %endif
+
+if:
+ call void asm "; clobber $0", "~{VCC}"() #0
+ %u = add i32 %v, %v
+ br label %endif
+
+endif:
+ %r = phi i32 [ %v, %entry ], [ %u, %if ]
+ store i32 %r, i32 addrspace(1)* %out
+ ret void
+}
+
+; Longest chain of cheap instructions (9 fmuls) that is still if-converted
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_max_cheap:
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_cndmask_b32_e32
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+ %v = load float, float addrspace(1)* %in
+ %cc = fcmp oeq float %v, 1.000000e+00
+ br i1 %cc, label %if, label %endif
+
+if:
+ %u.0 = fmul float %v, %v
+ %u.1 = fmul float %v, %u.0
+ %u.2 = fmul float %v, %u.1
+ %u.3 = fmul float %v, %u.2
+ %u.4 = fmul float %v, %u.3
+ %u.5 = fmul float %v, %u.4
+ %u.6 = fmul float %v, %u.5
+ %u.7 = fmul float %v, %u.6
+ %u.8 = fmul float %v, %u.7
+ br label %endif
+
+endif:
+ %r = phi float [ %v, %entry ], [ %u.8, %if ]
+ store float %r, float addrspace(1)* %out
+ ret void
+}
+
+; Shortest chain of cheap instructions (10 fmuls) that is not if-converted
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_min_expensive:
+; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+
+; GCN: [[ENDIF]]:
+; GCN: buffer_store_dword
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+ %v = load float, float addrspace(1)* %in
+ %cc = fcmp oeq float %v, 1.000000e+00
+ br i1 %cc, label %if, label %endif
+
+if:
+ %u.0 = fmul float %v, %v
+ %u.1 = fmul float %v, %u.0
+ %u.2 = fmul float %v, %u.1
+ %u.3 = fmul float %v, %u.2
+ %u.4 = fmul float %v, %u.3
+ %u.5 = fmul float %v, %u.4
+ %u.6 = fmul float %v, %u.5
+ %u.7 = fmul float %v, %u.6
+ %u.8 = fmul float %v, %u.7
+ %u.9 = fmul float %v, %u.8
+ br label %endif
+
+endif:
+ %r = phi float [ %v, %entry ], [ %u.9, %if ]
+ store float %r, float addrspace(1)* %out
+ ret void
+}
+
+; Should still branch over the fdiv expansion
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_expensive:
+; GCN: v_cmp_neq_f32_e32
+; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; GCN: v_div_scale_f32
+
+; GCN: [[ENDIF]]:
+; GCN: buffer_store_dword
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+ %v = load float, float addrspace(1)* %in
+ %cc = fcmp oeq float %v, 1.000000e+00
+ br i1 %cc, label %if, label %endif
+
+if:
+ %u = fdiv float %v, %v
+ br label %endif
+
+endif:
+ %r = phi float [ %v, %entry ], [ %u, %if ]
+ store float %r, float addrspace(1)* %out
+ ret void
+}
+
+; vcc branch with SGPR inputs
+; GCN-LABEL: {{^}}test_vccnz_sgpr_ifcvt_triangle:
+; GCN: v_cmp_neq_f32_e64
+; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; GCN: s_add_i32
+
+; GCN: [[ENDIF]]:
+; GCN: buffer_store_dword
+define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(2)* %in, float %cnd) #0 {
+entry:
+ %v = load i32, i32 addrspace(2)* %in
+ %cc = fcmp oeq float %cnd, 1.000000e+00
+ br i1 %cc, label %if, label %endif
+
+if:
+ %u = add i32 %v, %v
+ br label %endif
+
+endif:
+ %r = phi i32 [ %v, %entry ], [ %u, %if ]
+ store i32 %r, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load:
+; GCN: v_cndmask_b32
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(2)* %in) #0 {
+entry:
+ %v = load float, float addrspace(2)* %in
+ %cc = fcmp oeq float %v, 1.000000e+00
+ br i1 %cc, label %if, label %endif
+
+if:
+ %u = fadd float %v, %v
+ br label %endif
+
+endif:
+ %r = phi float [ %v, %entry ], [ %u, %if ]
+ store float %r, float addrspace(1)* %out
+ ret void
+}
+
+; Due to a broken cost heuristic, this is not if-converted like
+; test_vccnz_ifcvt_triangle_constant_load, even though it should be.
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload:
+; GCN: v_cndmask_b32
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 {
+entry:
+ %cc = fcmp oeq float %v, 1.000000e+00
+ br i1 %cc, label %if, label %endif
+
+if:
+ %u = fadd float %v, %v
+ br label %endif
+
+endif:
+ %r = phi float [ %v, %entry ], [ %u, %if ]
+ store float %r, float addrspace(1)* %out
+ ret void
+}
+
+; Scalar branch and scalar inputs
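+; With a uniform condition and SGPR values, the add is speculated and the
+; whole triangle folds to s_cmp_lg_u32 plus s_cselect_b32.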
+; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle:
+; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
+; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]]
+; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
+; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[ADD]], [[VAL]]
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(2)* %in, i32 %cond) #0 {
+entry:
+ %v = load i32, i32 addrspace(2)* %in
+ %cc = icmp eq i32 %cond, 1
+ br i1 %cc, label %if, label %endif
+
+if:
+ %u = add i32 %v, %v
+ br label %endif
+
+endif:
+ %r = phi i32 [ %v, %entry ], [ %u, %if ]
+ call void asm sideeffect "; reg use $0", "s"(i32 %r) #0
+ ret void
+}
+
+; FIXME: Should be able to use VALU compare and select
+; Scalar branch but VGPR select operands
+; GCN-LABEL: {{^}}test_scc1_vgpr_ifcvt_triangle:
+; GCN: s_cmp_lg_u32
+; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; GCN: v_add_f32_e32
+
+; GCN: [[ENDIF]]:
+; GCN: buffer_store_dword
+define amdgpu_kernel void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 {
+entry:
+ %v = load float, float addrspace(1)* %in
+ %cc = icmp eq i32 %cond, 1
+ br i1 %cc, label %if, label %endif
+
+if:
+ %u = fadd float %v, %v
+ br label %endif
+
+endif:
+ %r = phi float [ %v, %entry ], [ %u, %if ]
+ store float %r, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle64:
+; GCN: s_add_u32
+; GCN: s_addc_u32
+; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
+; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(2)* %in, i32 %cond) #0 {
+entry:
+ %v = load i64, i64 addrspace(2)* %in
+ %cc = icmp eq i32 %cond, 1
+ br i1 %cc, label %if, label %endif
+
+if:
+ %u = add i64 %v, %v
+ br label %endif
+
+endif:
+ %r = phi i64 [ %v, %entry ], [ %u, %if ]
+ call void asm sideeffect "; reg use $0", "s"(i64 %r) #0
+ ret void
+}
+
+; TODO: Can do s_cselect_b64; s_cselect_b32
+; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle96:
+; GCN: s_add_i32
+; GCN: s_add_i32
+; GCN: s_add_i32
+; GCN: s_add_i32
+; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
+; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(2)* %in, i32 %cond) #0 {
+entry:
+ %v = load <3 x i32>, <3 x i32> addrspace(2)* %in
+ %cc = icmp eq i32 %cond, 1
+ br i1 %cc, label %if, label %endif
+
+if:
+ %u = add <3 x i32> %v, %v
+ br label %endif
+
+endif:
+ %r = phi <3 x i32> [ %v, %entry ], [ %u, %if ]
+ %r.ext = shufflevector <3 x i32> %r, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r.ext) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle128:
+; GCN: s_add_i32
+; GCN: s_add_i32
+; GCN: s_add_i32
+; GCN: s_add_i32
+; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
+; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(2)* %in, i32 %cond) #0 {
+entry:
+ %v = load <4 x i32>, <4 x i32> addrspace(2)* %in
+ %cc = icmp eq i32 %cond, 1
+ br i1 %cc, label %if, label %endif
+
+if:
+ %u = add <4 x i32> %v, %v
+ br label %endif
+
+endif:
+ %r = phi <4 x i32> [ %v, %entry ], [ %u, %if ]
+ call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select:
+; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
+; GCN: s_cselect_b32 s{{[0-9]+}}, 1, 0{{$}}
+define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) {
+entry:
+ %cmp0 = icmp eq i32 %cond, 0
+ br i1 %cmp0, label %else, label %if
+
+if:
+ br label %done
+
+else:
+ br label %done
+
+done:
+ %value = phi i32 [0, %if], [1, %else]
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
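+; An undef condition is still if-converted; the checks expect only the
+; constant select left in the entry block.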
+; GCN-LABEL: {{^}}ifcvt_undef_scc:
+; GCN: {{^}}; BB#0:
+; GCN-NEXT: s_load_dwordx2
+; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 1, 0
+define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) {
+entry:
+ br i1 undef, label %else, label %if
+
+if:
+ br label %done
+
+else:
+ br label %done
+
+done:
+ %value = phi i32 [0, %if], [1, %else]
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
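+; The 256-bit and 512-bit cases are too long to if-convert; the branch remains.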
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle256:
+; GCN: v_cmp_neq_f32
+; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; GCN: v_add_i32
+; GCN: v_add_i32
+
+; GCN: [[ENDIF]]:
+; GCN: buffer_store_dword
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 {
+entry:
+ %v = load <8 x i32>, <8 x i32> addrspace(1)* %in
+ %cc = fcmp oeq float %cnd, 1.000000e+00
+ br i1 %cc, label %if, label %endif
+
+if:
+ %u = add <8 x i32> %v, %v
+ br label %endif
+
+endif:
+ %r = phi <8 x i32> [ %v, %entry ], [ %u, %if ]
+ store <8 x i32> %r, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle512:
+; GCN: v_cmp_neq_f32
+; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; GCN: v_add_i32
+; GCN: v_add_i32
+
+; GCN: [[ENDIF]]:
+; GCN: buffer_store_dword
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 {
+entry:
+ %v = load <16 x i32>, <16 x i32> addrspace(1)* %in
+ %cc = fcmp oeq float %cnd, 1.000000e+00
+ br i1 %cc, label %if, label %endif
+
+if:
+ %u = add <16 x i32> %v, %v
+ br label %endif
+
+endif:
+ %r = phi <16 x i32> [ %v, %entry ], [ %u, %if ]
+ store <16 x i32> %r, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/early-inline-alias.ll b/test/CodeGen/AMDGPU/early-inline-alias.ll
new file mode 100644
index 000000000000..42dfa4e7ab4f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/early-inline-alias.ll
@@ -0,0 +1,12 @@
+; RUN: opt -mtriple=amdgcn-- -O1 -S -inline-threshold=1 %s | FileCheck %s
+
+; CHECK: @add1alias = alias i32 (i32), i32 (i32)* @add1
+; CHECK: @add1alias2 = alias i32 (i32), i32 (i32)* @add1
+
+@add1alias = alias i32 (i32), i32 (i32)* @add1
+@add1alias2 = alias i32 (i32), i32 (i32)* @add1
+
+define i32 @add1(i32) {
+ %2 = add nsw i32 %0, 1
+ ret i32 %2
+}
diff --git a/test/CodeGen/AMDGPU/early-inline.ll b/test/CodeGen/AMDGPU/early-inline.ll
new file mode 100644
index 000000000000..c871d54bec7e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/early-inline.ll
@@ -0,0 +1,25 @@
+; RUN: opt -mtriple=amdgcn-- -O1 -S -inline-threshold=1 -amdgpu-early-inline-all %s | FileCheck %s
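+; With -amdgpu-early-inline-all the callee is inlined even though the inline
+; threshold of 1 would otherwise reject it.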
+
+; CHECK: @c_alias
+@c_alias = alias i32 (i32), i32 (i32)* @callee
+
+define i32 @callee(i32 %x) {
+entry:
+ %mul1 = mul i32 %x, %x
+ %mul2 = mul i32 %mul1, %x
+ %mul3 = mul i32 %mul1, %mul2
+ %mul4 = mul i32 %mul3, %mul2
+ %mul5 = mul i32 %mul4, %mul3
+ ret i32 %mul5
+}
+
+; CHECK-LABEL: @caller
+; CHECK: mul i32
+; CHECK-NOT: call i32
+
+define amdgpu_kernel void @caller(i32 %x) {
+entry:
+ %res = call i32 @callee(i32 %x)
+ store volatile i32 %res, i32 addrspace(1)* undef
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/elf.ll b/test/CodeGen/AMDGPU/elf.ll
index 628dd5ec839e..b22f8608d7e3 100644
--- a/test/CodeGen/AMDGPU/elf.ll
+++ b/test/CodeGen/AMDGPU/elf.ll
@@ -24,11 +24,13 @@
; TONGA-NEXT: .long 704
; CONFIG: .p2align 8
; CONFIG: test:
-define amdgpu_ps void @test(i32 %p) {
+define amdgpu_ps void @test(i32 %p) #0 {
%i = add i32 %p, 2
%r = bitcast i32 %i to float
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r, float %r, float %r, float %r, i1 true, i1 false)
ret void
}
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/elf.r600.ll b/test/CodeGen/AMDGPU/elf.r600.ll
index 51cd08500932..93c5e5575033 100644
--- a/test/CodeGen/AMDGPU/elf.r600.ll
+++ b/test/CodeGen/AMDGPU/elf.r600.ll
@@ -9,7 +9,7 @@
; CONFIG-NEXT: .long 2
; CONFIG-NEXT: .long 165900
; CONFIG-NEXT: .long 0
-define void @test(float addrspace(1)* %out, i32 %p) {
+define amdgpu_kernel void @test(float addrspace(1)* %out, i32 %p) {
%i = add i32 %p, 2
%r = bitcast i32 %i to float
store float %r, float addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/else.ll b/test/CodeGen/AMDGPU/else.ll
index ef1e64763d4a..22338e4f50e5 100644
--- a/test/CodeGen/AMDGPU/else.ll
+++ b/test/CodeGen/AMDGPU/else.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
; CHECK-LABEL: {{^}}else_no_execfix:
; CHECK: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 [[DST:s\[[0-9]+:[0-9]+\]]],
; CHECK-NEXT: s_xor_b64 exec, exec, [[DST]]
; CHECK-NEXT: ; mask branch
-define amdgpu_ps float @else_no_execfix(i32 %z, float %v) {
+define amdgpu_ps float @else_no_execfix(i32 %z, float %v) #0 {
main_body:
%cc = icmp sgt i32 %z, 5
br i1 %cc, label %if, label %else
@@ -33,7 +33,7 @@ end:
; CHECK-NEXT: s_and_b64 [[AND_INIT:s\[[0-9]+:[0-9]+\]]], exec, [[DST]]
; CHECK-NEXT: s_xor_b64 exec, exec, [[AND_INIT]]
; CHECK-NEXT: ; mask branch
-define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) {
+define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) #0 {
main_body:
%cc = icmp sgt i32 %z, 5
br i1 %cc, label %if, label %else
@@ -44,8 +44,7 @@ if:
else:
%c = fmul float %v, 3.0
- %c.i = bitcast float %c to i32
- %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%v.else = extractelement <4 x float> %tex, i32 0
br label %end
@@ -55,6 +54,9 @@ end:
ret void
}
-declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) nounwind
+declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
-declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) nounwind readnone
+attributes #0 = { nounwind }
+attributes #1 = { nounwind writeonly }
+attributes #2 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/empty-function.ll b/test/CodeGen/AMDGPU/empty-function.ll
index a060900811ea..1231cb4d1de2 100644
--- a/test/CodeGen/AMDGPU/empty-function.ll
+++ b/test/CodeGen/AMDGPU/empty-function.ll
@@ -7,14 +7,14 @@
; SI-LABEL: {{^}}empty_function_ret:
; SI: s_endpgm
; SI: codeLenInByte = 4
-define void @empty_function_ret() #0 {
+define amdgpu_kernel void @empty_function_ret() #0 {
ret void
}
; SI: .text
; SI-LABEL: {{^}}empty_function_unreachable:
; SI: codeLenInByte = 0
-define void @empty_function_unreachable() #0 {
+define amdgpu_kernel void @empty_function_unreachable() #0 {
unreachable
}
diff --git a/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll b/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
new file mode 100644
index 000000000000..6eb1fc1d0cc2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=amdgcn -enable-no-signed-zeros-fp-math=0 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SAFE %s
+; RUN: llc -march=amdgcn -enable-no-signed-zeros-fp-math=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-UNSAFE %s
+; RUN: llc -march=amdgcn -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-UNSAFE %s
+
+; Test that the -enable-no-signed-zeros-fp-math flag works
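+; Without nsz, negating the fsub result requires an explicit sign-bit xor;
+; with nsz or unsafe math the xor is folded away.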
+
+; GCN-LABEL: {{^}}fneg_fsub_f32:
+; GCN: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
+; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]]
+
+; GCN-UNSAFE-NOT: xor
+define amdgpu_kernel void @fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %a = load float, float addrspace(1)* %in, align 4
+ %b = load float, float addrspace(1)* %b_ptr, align 4
+ %result = fsub float %a, %b
+ %neg.result = fsub float -0.0, %result
+ store float %neg.result, float addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/endcf-loop-header.ll b/test/CodeGen/AMDGPU/endcf-loop-header.ll
index c67095438ee5..bd861e0c663e 100644
--- a/test/CodeGen/AMDGPU/endcf-loop-header.ll
+++ b/test/CodeGen/AMDGPU/endcf-loop-header.ll
@@ -12,7 +12,7 @@
; CHECK: [[LOOP_LABEL:[0-9A-Za-z_]+]]: ; %loop{{$}}
; CHECK-NOT: s_or_b64 exec, exec
; CHECK: s_cbranch_execnz [[LOOP_LABEL]]
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
entry:
%cond = call i32 @llvm.r600.read.tidig.x() #0
%tmp0 = icmp eq i32 %cond, 0
diff --git a/test/CodeGen/AMDGPU/env-amdgiz.ll b/test/CodeGen/AMDGPU/env-amdgiz.ll
new file mode 100644
index 000000000000..70e4fb30d3aa
--- /dev/null
+++ b/test/CodeGen/AMDGPU/env-amdgiz.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa-amdgiz -verify-machineinstrs < %s
+; Just check that the target feature and data layout are accepted without error.
+
+target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"
+target triple = "amdgcn-amd-amdhsa-amdgiz"
+
+define void @foo() {
+entry:
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/env-amdgizcl.ll b/test/CodeGen/AMDGPU/env-amdgizcl.ll
new file mode 100644
index 000000000000..feb213562c80
--- /dev/null
+++ b/test/CodeGen/AMDGPU/env-amdgizcl.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa-amdgizcl -verify-machineinstrs < %s
+; Just check that the target feature and data layout are accepted without error.
+
+target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"
+target triple = "amdgcn-amd-amdhsa-amdgizcl"
+
+define void @foo() {
+entry:
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/exceed-max-sgprs.ll b/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
index 8ef54b9e95d3..40d115bfc060 100644
--- a/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
+++ b/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
@@ -1,7 +1,7 @@
; RUN: not llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_tahiti
-define void @use_too_many_sgprs_tahiti() #0 {
+define amdgpu_kernel void @use_too_many_sgprs_tahiti() #0 {
call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" ()
@@ -20,7 +20,7 @@ define void @use_too_many_sgprs_tahiti() #0 {
}
; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire
-define void @use_too_many_sgprs_bonaire() #1 {
+define amdgpu_kernel void @use_too_many_sgprs_bonaire() #1 {
call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" ()
@@ -39,7 +39,7 @@ define void @use_too_many_sgprs_bonaire() #1 {
}
; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire_flat_scr
-define void @use_too_many_sgprs_bonaire_flat_scr() #1 {
+define amdgpu_kernel void @use_too_many_sgprs_bonaire_flat_scr() #1 {
call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" ()
@@ -59,7 +59,7 @@ define void @use_too_many_sgprs_bonaire_flat_scr() #1 {
}
; ERROR: error: scalar registers limit of 96 exceeded (98) in use_too_many_sgprs_iceland
-define void @use_too_many_sgprs_iceland() #2 {
+define amdgpu_kernel void @use_too_many_sgprs_iceland() #2 {
call void asm sideeffect "", "~{VCC}" ()
call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
@@ -77,7 +77,7 @@ define void @use_too_many_sgprs_iceland() #2 {
}
; ERROR: error: addressable scalar registers limit of 102 exceeded (103) in use_too_many_sgprs_fiji
-define void @use_too_many_sgprs_fiji() #3 {
+define amdgpu_kernel void @use_too_many_sgprs_fiji() #3 {
call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" ()
diff --git a/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll b/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll
index cf384da2c5be..0fa06b87eba2 100644
--- a/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll
+++ b/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll
@@ -3,7 +3,7 @@
; GCN-LABEL: and_zext:
; GCN: v_and_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[VAL16]]
-define void @and_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @and_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
%ptr = getelementptr i16, i16 addrspace(1)* %in, i32 %id
%a = load i16, i16 addrspace(1)* %in
@@ -18,7 +18,7 @@ define void @and_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
; GCN-LABEL: or_zext:
; GCN: v_or_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[VAL16]]
-define void @or_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @or_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
%ptr = getelementptr i16, i16 addrspace(1)* %in, i32 %id
%a = load i16, i16 addrspace(1)* %in
@@ -33,7 +33,7 @@ define void @or_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
; GCN-LABEL: xor_zext:
; GCN: v_xor_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[VAL16]]
-define void @xor_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @xor_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
%ptr = getelementptr i16, i16 addrspace(1)* %in, i32 %id
%a = load i16, i16 addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/extload-align.ll b/test/CodeGen/AMDGPU/extload-align.ll
index 9d2eb74c7ba9..4644800421d8 100644
--- a/test/CodeGen/AMDGPU/extload-align.ll
+++ b/test/CodeGen/AMDGPU/extload-align.ll
@@ -9,7 +9,7 @@
; DEBUG: mem:LD2[<unknown>]{{[^(]}}
; DEBUG: {{^}}# End machine code for function extload_align.
-define void @extload_align(i32* %out, i32 %index) #0 {
+define amdgpu_kernel void @extload_align(i32* %out, i32 %index) #0 {
%v0 = alloca [4 x i16]
%a1 = getelementptr inbounds [4 x i16], [4 x i16]* %v0, i32 0, i32 0
%a2 = getelementptr inbounds [4 x i16], [4 x i16]* %v0, i32 0, i32 1
diff --git a/test/CodeGen/AMDGPU/extload-private.ll b/test/CodeGen/AMDGPU/extload-private.ll
index 6cebe5f495c5..fd298b361d03 100644
--- a/test/CodeGen/AMDGPU/extload-private.ll
+++ b/test/CodeGen/AMDGPU/extload-private.ll
@@ -2,8 +2,8 @@
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}load_i8_sext_private:
-; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+$}}
-define void @load_i8_sext_private(i32 addrspace(1)* %out) {
+; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
+define amdgpu_kernel void @load_i8_sext_private(i32 addrspace(1)* %out) {
entry:
%tmp0 = alloca i8
%tmp1 = load i8, i8* %tmp0
@@ -13,8 +13,8 @@ entry:
}
; FUNC-LABEL: {{^}}load_i8_zext_private:
-; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+$}}
-define void @load_i8_zext_private(i32 addrspace(1)* %out) {
+; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
+define amdgpu_kernel void @load_i8_zext_private(i32 addrspace(1)* %out) {
entry:
%tmp0 = alloca i8
%tmp1 = load i8, i8* %tmp0
@@ -24,8 +24,8 @@ entry:
}
; FUNC-LABEL: {{^}}load_i16_sext_private:
-; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+$}}
-define void @load_i16_sext_private(i32 addrspace(1)* %out) {
+; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
+define amdgpu_kernel void @load_i16_sext_private(i32 addrspace(1)* %out) {
entry:
%tmp0 = alloca i16
%tmp1 = load i16, i16* %tmp0
@@ -35,8 +35,8 @@ entry:
}
; FUNC-LABEL: {{^}}load_i16_zext_private:
-; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+$}}
-define void @load_i16_zext_private(i32 addrspace(1)* %out) {
+; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
+define amdgpu_kernel void @load_i16_zext_private(i32 addrspace(1)* %out) {
entry:
%tmp0 = alloca i16
%tmp1 = load volatile i16, i16* %tmp0
diff --git a/test/CodeGen/AMDGPU/extload.ll b/test/CodeGen/AMDGPU/extload.ll
index 8b3e087d1f45..a7b8e86220aa 100644
--- a/test/CodeGen/AMDGPU/extload.ll
+++ b/test/CodeGen/AMDGPU/extload.ll
@@ -10,7 +10,7 @@
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]],
; EG: VTX_READ_32 [[VAL]]
-define void @global_anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
+define amdgpu_kernel void @global_anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
%cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)*
%load = load i32, i32 addrspace(1)* %cast
%x = bitcast i32 %load to <4 x i8>
@@ -25,7 +25,7 @@ define void @global_anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 a
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]],
; EG: VTX_READ_32 [[VAL]]
-define void @global_anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
+define amdgpu_kernel void @global_anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
%cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)*
%load = load i32, i32 addrspace(1)* %cast
%x = bitcast i32 %load to <2 x i16>
@@ -40,7 +40,7 @@ define void @global_anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i1
; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]]
; EG: LDS_WRITE * [[VAL]]
-define void @local_anyext_load_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
+define amdgpu_kernel void @local_anyext_load_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
%cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)*
%load = load i32, i32 addrspace(3)* %cast
%x = bitcast i32 %load to <4 x i8>
@@ -55,7 +55,7 @@ define void @local_anyext_load_i8(i8 addrspace(3)* nocapture noalias %out, i8 ad
; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]]
; EG: LDS_WRITE * [[VAL]]
-define void @local_anyext_load_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
+define amdgpu_kernel void @local_anyext_load_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
%cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)*
%load = load i32, i32 addrspace(3)* %cast
%x = bitcast i32 %load to <2 x i16>
diff --git a/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll b/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
index 4edff152e66e..be85ca933c33 100644
--- a/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
+++ b/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
@@ -13,7 +13,7 @@
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
-define void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
+define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
<4 x i32> addrspace(1)* noalias %out1,
i32 addrspace(1)* noalias %out2,
i32 addrspace(1)* %in) {
@@ -55,7 +55,7 @@ define void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noal
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
-define void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
+define amdgpu_kernel void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
<4 x i32> addrspace(1)* noalias %out1,
i32 addrspace(1)* noalias %out2,
i32 addrspace(1)* %in) {
@@ -99,7 +99,7 @@ define void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(
; GCN: buffer_store_dwordx2
; GCN: buffer_store_dwordx2
-define void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64(<2 x i64> addrspace(1)* noalias %out0,
+define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64(<2 x i64> addrspace(1)* noalias %out0,
<4 x i32> addrspace(1)* noalias %out1,
i64 addrspace(1)* noalias %out2,
i32 addrspace(1)* %in) {
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
new file mode 100644
index 000000000000..1f567ae05081
--- /dev/null
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -0,0 +1,128 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
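+; A <2 x half> vector loaded from constant memory stays scalar; element 1 is
+; extracted with a 16-bit right shift instead of going through memory.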
+; GCN-LABEL: {{^}}extract_vector_elt_v2f16:
+; GCN: s_load_dword [[VEC:s[0-9]+]]
+; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
+; GCN-DAG: v_mov_b32_e32 [[VELT0:v[0-9]+]], [[VEC]]
+; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
+; GCN-DAG: buffer_store_short [[VELT0]]
+; GCN-DAG: buffer_store_short [[VELT1]]
+define amdgpu_kernel void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
+ %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
+ %p0 = extractelement <2 x half> %vec, i32 0
+ %p1 = extractelement <2 x half> %vec, i32 1
+ %out1 = getelementptr half, half addrspace(1)* %out, i32 10
+ store half %p1, half addrspace(1)* %out, align 2
+ store half %p0, half addrspace(1)* %out1, align 2
+ ret void
+}
+
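+; A dynamic SGPR index is scaled to a bit offset (idx * 16) and used as a
+; variable shift, so no scratch is needed (ScratchSize: 0).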
+; GCN-LABEL: {{^}}extract_vector_elt_v2f16_dynamic_sgpr:
+; GCN: s_load_dword [[IDX:s[0-9]+]]
+; GCN: s_load_dword [[VEC:s[0-9]+]]
+; GCN: s_lshl_b32 [[IDX_SCALED:s[0-9]+]], [[IDX]], 16
+; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], [[IDX_SCALED]]
+; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
+; GCN: buffer_store_short [[VELT1]]
+; GCN: ScratchSize: 0
+define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 %idx) #0 {
+ %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
+ %elt = extractelement <2 x half> %vec, i32 %idx
+ store half %elt, half addrspace(1)* %out, align 2
+ ret void
+}
+
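+; Same shift-based lowering with a VGPR index; SI and VI differ only in the
+; shift opcode (v_lshr_b32 vs. v_lshrrev_b32).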
+; GCN-LABEL: {{^}}extract_vector_elt_v2f16_dynamic_vgpr:
+; GCN-DAG: s_load_dword [[VEC:s[0-9]+]]
+; GCN-DAG: {{flat|buffer}}_load_dword [[IDX:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 [[IDX_SCALED:v[0-9]+]], 16, [[IDX]]
+
+; SI: v_lshr_b32_e32 [[ELT:v[0-9]+]], [[VEC]], [[IDX_SCALED]]
+; VI: v_lshrrev_b32_e64 [[ELT:v[0-9]+]], [[IDX_SCALED]], [[VEC]]
+
+; SI: buffer_store_short [[ELT]]
+; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]]
+; GCN: ScratchSize: 0{{$}}
+define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
+ %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
+ %idx = load i32, i32 addrspace(1)* %gep
+ %elt = extractelement <2 x half> %vec, i32 %idx
+ store half %elt, half addrspace(1)* %out.gep, align 2
+ ret void
+}
+
+; GCN-LABEL: {{^}}extract_vector_elt_v3f16:
+; GCN: buffer_load_ushort
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo) #0 {
+ %p0 = extractelement <3 x half> %foo, i32 0
+ %p1 = extractelement <3 x half> %foo, i32 2
+ %out1 = getelementptr half, half addrspace(1)* %out, i32 1
+ store half %p1, half addrspace(1)* %out, align 2
+ store half %p0, half addrspace(1)* %out1, align 2
+ ret void
+}
+
+; GCN-LABEL: {{^}}extract_vector_elt_v4f16:
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+define amdgpu_kernel void @extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo) #0 {
+ %p0 = extractelement <4 x half> %foo, i32 0
+ %p1 = extractelement <4 x half> %foo, i32 2
+ %out1 = getelementptr half, half addrspace(1)* %out, i32 10
+ store half %p1, half addrspace(1)* %out, align 2
+ store half %p0, half addrspace(1)* %out1, align 2
+ ret void
+}
+
+; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16:
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+
+; GCN: buffer_load_ushort
+; GCN: buffer_store_short
+define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 {
+ %p0 = extractelement <3 x half> %foo, i32 %idx
+ %out1 = getelementptr half, half addrspace(1)* %out, i32 1
+ store half %p0, half addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4f16:
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+
+; GCN: buffer_load_ushort
+; GCN: buffer_store_short
+define amdgpu_kernel void @dynamic_extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo, i32 %idx) #0 {
+ %p0 = extractelement <4 x half> %foo, i32 %idx
+ %out1 = getelementptr half, half addrspace(1)* %out, i32 1
+ store half %p0, half addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll b/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
index 4594379dae03..db5bf0b4e808 100644
--- a/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
@@ -5,7 +5,7 @@
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx2
; GCN: buffer_store_dwordx2
-define void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
%ld = load volatile <3 x double>, <3 x double> addrspace(1)* %in
%elt = extractelement <3 x double> %ld, i32 2
store volatile double %elt, double addrspace(1)* %out
@@ -13,14 +13,14 @@ define void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, <3 x double>
}
; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3f64:
-define void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 {
%dynelt = extractelement <3 x double> %foo, i32 %elt
store volatile double %dynelt, double addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4f64:
-define void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 {
%dynelt = extractelement <4 x double> %foo, i32 %elt
store volatile double %dynelt, double addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
index c407f0efffb4..9b117d48a980 100644
--- a/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
@@ -1,25 +1,67 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; GCN-LABEL: {{^}}extract_vector_elt_v2i16:
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) #0 {
- %p0 = extractelement <2 x i16> %foo, i32 0
- %p1 = extractelement <2 x i16> %foo, i32 1
+; GCN: s_load_dword [[VEC:s[0-9]+]]
+; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
+; GCN-DAG: v_mov_b32_e32 [[VELT0:v[0-9]+]], [[VEC]]
+; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
+; GCN-DAG: buffer_store_short [[VELT0]]
+; GCN-DAG: buffer_store_short [[VELT1]]
+define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
+ %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+ %p0 = extractelement <2 x i16> %vec, i32 0
+ %p1 = extractelement <2 x i16> %vec, i32 1
%out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10
store i16 %p1, i16 addrspace(1)* %out, align 2
store i16 %p0, i16 addrspace(1)* %out1, align 2
ret void
}
+; GCN-LABEL: {{^}}extract_vector_elt_v2i16_dynamic_sgpr:
+; GCN: s_load_dword [[IDX:s[0-9]+]]
+; GCN: s_load_dword [[VEC:s[0-9]+]]
+; GCN: s_lshl_b32 [[IDX_SCALED:s[0-9]+]], [[IDX]], 16
+; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], [[IDX_SCALED]]
+; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
+; GCN: buffer_store_short [[VELT1]]
+; GCN: ScratchSize: 0
+define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %idx) #0 {
+ %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+ %elt = extractelement <2 x i16> %vec, i32 %idx
+ store i16 %elt, i16 addrspace(1)* %out, align 2
+ ret void
+}
+
+; GCN-LABEL: {{^}}extract_vector_elt_v2i16_dynamic_vgpr:
+; GCN-DAG: s_load_dword [[VEC:s[0-9]+]]
+; GCN-DAG: {{flat|buffer}}_load_dword [[IDX:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 [[IDX_SCALED:v[0-9]+]], 16, [[IDX]]
+
+; SI: v_lshr_b32_e32 [[ELT:v[0-9]+]], [[VEC]], [[IDX_SCALED]]
+; VI: v_lshrrev_b32_e64 [[ELT:v[0-9]+]], [[IDX_SCALED]], [[VEC]]
+
+; SI: buffer_store_short [[ELT]]
+; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]]
+; GCN: ScratchSize: 0{{$}}
+define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
+ %idx = load volatile i32, i32 addrspace(1)* %gep
+ %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+ %elt = extractelement <2 x i16> %vec, i32 %idx
+ store i16 %elt, i16 addrspace(1)* %out.gep, align 2
+ ret void
+}
+
; GCN-LABEL: {{^}}extract_vector_elt_v3i16:
; GCN: buffer_load_ushort
; GCN: buffer_store_short
; GCN: buffer_store_short
-define void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 {
%p0 = extractelement <3 x i16> %foo, i32 0
%p1 = extractelement <3 x i16> %foo, i32 2
%out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
@@ -29,16 +71,23 @@ define void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0
}
; GCN-LABEL: {{^}}extract_vector_elt_v4i16:
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 {
+; SICIVI: buffer_load_ushort
+; SICIVI: buffer_load_ushort
+; SICIVI: buffer_store_short
+; SICIVI: buffer_store_short
+
+; GFX9-DAG: s_load_dword [[LOAD0:s[0-9]+]], s[0:1], 0x2c
+; GFX9-DAG: s_load_dword [[LOAD1:s[0-9]+]], s[0:1], 0x30
+; GFX9-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], [[LOAD0]]
+; GFX9-DAG: buffer_store_short [[VLOAD0]], off
+; GFX9-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], [[LOAD1]]
+; GFX9-DAG: buffer_store_short [[VLOAD1]], off
+define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 {
%p0 = extractelement <4 x i16> %foo, i32 0
%p1 = extractelement <4 x i16> %foo, i32 2
%out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10
- store i16 %p1, i16 addrspace(1)* %out, align 2
- store i16 %p0, i16 addrspace(1)* %out1, align 2
+ store volatile i16 %p1, i16 addrspace(1)* %out, align 2
+ store volatile i16 %p0, i16 addrspace(1)* %out1, align 2
ret void
}
@@ -47,13 +96,16 @@ define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
+; SICIVI: buffer_store_short
+; SICIVI: buffer_store_short
+; SICIVI: buffer_store_short
+
+; GFX9: buffer_store_dword
+; GFX9: buffer_store_dword
; GCN: buffer_load_ushort
; GCN: buffer_store_short
-define void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
%p0 = extractelement <3 x i16> %foo, i32 %idx
%out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
store i16 %p0, i16 addrspace(1)* %out
@@ -61,23 +113,32 @@ define void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16>
}
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i16:
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
+; SICIVI: buffer_load_ushort
+; SICIVI: buffer_load_ushort
+; SICIVI: buffer_load_ushort
+; SICIVI: buffer_load_ushort
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
+; SICIVI: buffer_store_short
+; SICIVI: buffer_store_short
+; SICIVI: buffer_store_short
+; SICIVI: buffer_store_short
-; GCN: buffer_load_ushort
-; GCN: buffer_store_short
-define void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 {
+; SICIVI: buffer_load_ushort
+; SICIVI: buffer_store_short
+
+; GFX9: s_load_dword
+; GFX9: buffer_store_dword
+; GFX9: buffer_store_dword
+; GFX9: buffer_load_ushort
+; GFX9: buffer_store_short
+define amdgpu_kernel void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 {
%p0 = extractelement <4 x i16> %foo, i32 %idx
%out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
store i16 %p0, i16 addrspace(1)* %out
ret void
}
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll b/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
index 1df91c93329a..a8d127879a32 100644
--- a/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
@@ -8,7 +8,7 @@
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dwordx2
-define void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) #0 {
+define amdgpu_kernel void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) #0 {
%vec = bitcast i64 %val to <2 x i32>
%elt0 = extractelement <2 x i32> %vec, i32 0
%elt1 = extractelement <2 x i32> %vec, i32 1
@@ -20,7 +20,7 @@ define void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspa
}
; GCN-LABEL: {{^}}extract_vector_elt_v2i64:
-define void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0 {
%p0 = extractelement <2 x i64> %foo, i32 0
%p1 = extractelement <2 x i64> %foo, i32 1
%out1 = getelementptr i64, i64 addrspace(1)* %out, i32 1
@@ -30,14 +30,14 @@ define void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0
}
; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64:
-define void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 {
%dynelt = extractelement <2 x i64> %foo, i32 %elt
store volatile i64 %dynelt, i64 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64_2:
-define void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) #0 {
%load = load volatile <2 x i64>, <2 x i64> addrspace(1)* %foo
%or = or <2 x i64> %load, %arst
%dynelt = extractelement <2 x i64> %or, i32 %elt
@@ -46,14 +46,14 @@ define void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> ad
}
; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3i64:
-define void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 {
%dynelt = extractelement <3 x i64> %foo, i32 %elt
store volatile i64 %dynelt, i64 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4i64:
-define void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 {
%dynelt = extractelement <4 x i64> %foo, i32 %elt
store volatile i64 %dynelt, i64 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
index 6f4ae827f432..b7d768fd5525 100644
--- a/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
@@ -4,7 +4,7 @@
; FUNC-LABEL: {{^}}extract_vector_elt_v1i8:
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
-define void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 {
%p0 = extractelement <1 x i8> %foo, i32 0
store i8 %p0, i8 addrspace(1)* %out
ret void
@@ -15,7 +15,7 @@ define void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 {
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
-define void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 {
%p0 = extractelement <2 x i8> %foo, i32 0
%p1 = extractelement <2 x i8> %foo, i32 1
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -29,7 +29,7 @@ define void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 {
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
-define void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 {
%p0 = extractelement <3 x i8> %foo, i32 0
%p1 = extractelement <3 x i8> %foo, i32 2
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -43,7 +43,7 @@ define void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 {
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
-define void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 {
%p0 = extractelement <4 x i8> %foo, i32 0
%p1 = extractelement <4 x i8> %foo, i32 2
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -57,7 +57,7 @@ define void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 {
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
-define void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 {
%p0 = extractelement <8 x i8> %foo, i32 0
%p1 = extractelement <8 x i8> %foo, i32 2
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -71,7 +71,7 @@ define void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 {
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
-define void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 {
%p0 = extractelement <16 x i8> %foo, i32 0
%p1 = extractelement <16 x i8> %foo, i32 2
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -85,7 +85,7 @@ define void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
-define void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 {
%p0 = extractelement <32 x i8> %foo, i32 0
%p1 = extractelement <32 x i8> %foo, i32 2
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -99,7 +99,7 @@ define void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
-define void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 {
%p0 = extractelement <64 x i8> %foo, i32 0
%p1 = extractelement <64 x i8> %foo, i32 2
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -120,7 +120,7 @@ define void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0
; GCN: buffer_store_byte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
-define void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 {
%p0 = extractelement <3 x i8> %foo, i32 %idx
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
store i8 %p0, i8 addrspace(1)* %out
@@ -141,7 +141,7 @@ define void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %fo
; GCN: buffer_store_byte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
-define void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 %idx) #0 {
%p0 = extractelement <4 x i8> %foo, i32 %idx
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
store i8 %p0, i8 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
index e160c20a03a0..34999fa3aea4 100644
--- a/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
+++ b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
@@ -7,7 +7,7 @@
; GCN-DAG: buffer_load_dword [[A:v[0-9]+]]
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, [[B]], [[A]]
; GCN: buffer_store_dword [[ADD]]
-define void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
+define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
%a = load i64, i64 addrspace(1)* %in
%add = add i64 %a, %b
%val.bc = bitcast i64 %add to <2 x i32>
@@ -20,7 +20,7 @@ define void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspa
; GCN: buffer_load_dwordx2
; GCN: v_add_f64
; GCN: buffer_store_dword v
-define void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) {
+define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) {
%a = load double, double addrspace(1)* %in
%add = fadd double %a, %b
%val.bc = bitcast double %add to <2 x i32>
@@ -33,7 +33,7 @@ define void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrs
; GCN: buffer_load_dwordx2
; GCN: v_add_i32
; GCN: buffer_store_dword
-define void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
+define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
%a = load i64, i64 addrspace(1)* %in
%add = add i64 %a, %b
%val.bc = bitcast i64 %add to <2 x float>
@@ -45,7 +45,7 @@ define void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 add
; GCN-LABEL: {{^}}no_extract_volatile_load_extract0:
; GCN: buffer_load_dwordx4
; GCN: buffer_store_dword v
-define void @no_extract_volatile_load_extract0(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @no_extract_volatile_load_extract0(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
entry:
%vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
%elt0 = extractelement <4 x i32> %vec, i32 0
@@ -57,7 +57,7 @@ entry:
; GCN: buffer_load_dwordx4
; GCN: buffer_store_dword v
-define void @no_extract_volatile_load_extract2(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @no_extract_volatile_load_extract2(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
entry:
%vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
%elt2 = extractelement <4 x i32> %vec, i32 2
@@ -68,7 +68,7 @@ entry:
; GCN-LABEL: {{^}}no_extract_volatile_load_dynextract:
; GCN: buffer_load_dwordx4
; GCN: buffer_store_dword v
-define void @no_extract_volatile_load_dynextract(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @no_extract_volatile_load_dynextract(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
entry:
%vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
%eltN = extractelement <4 x i32> %vec, i32 %idx
diff --git a/test/CodeGen/AMDGPU/fabs.f16.ll b/test/CodeGen/AMDGPU/fabs.f16.ll
index c64aa6228c71..d4ef7124a334 100644
--- a/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -1,69 +1,74 @@
; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; DAGCombiner will transform:
; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFF))
; unless isFabsFree returns true
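+;
+; A minimal sketch of that combine in IR form (assuming isFabsFree returns
+; false for f16, so the fabs is lowered to an integer sign-bit mask):
+;   %bc  = bitcast i16 %a to half
+;   %abs = call half @llvm.fabs.f16(half %bc)
+; becomes
+;   %bits = and i16 %a, 32767        ; 0x7fff clears the f16 sign bit
+;   %abs  = bitcast i16 %bits to half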
-; GCN-LABEL: {{^}}fabs_free_f16:
+; GCN-LABEL: {{^}}s_fabs_free_f16:
; GCN: flat_load_ushort [[VAL:v[0-9]+]],
; GCN: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fabs_free_f16(half addrspace(1)* %out, i16 %in) {
+define amdgpu_kernel void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
%bc= bitcast i16 %in to half
%fabs = call half @llvm.fabs.f16(half %bc)
store half %fabs, half addrspace(1)* %out
ret void
}
-; GCN-LABEL: {{^}}fabs_f16:
+; GCN-LABEL: {{^}}s_fabs_f16:
; CI: flat_load_ushort [[VAL:v[0-9]+]],
-; CI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[VAL]]
-; CI: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], |[[CVT0]]|
+; CI: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]]
; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fabs_f16(half addrspace(1)* %out, half %in) {
+define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) {
%fabs = call half @llvm.fabs.f16(half %in)
store half %fabs, half addrspace(1)* %out
ret void
}
; FIXME: Should be able to use single and
-; GCN-LABEL: {{^}}fabs_v2f16:
-; CI: v_cvt_f32_f16_e32 v{{[0-9]+}},
-; CI: v_cvt_f32_f16_e32 v{{[0-9]+}},
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
+; GCN-LABEL: {{^}}s_fabs_v2f16:
+; CI: s_movk_i32 [[MASK:s[0-9]+]], 0x7fff
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
+; CI: v_or_b32_e32
-; VI: flat_load_ushort [[LO:v[0-9]+]]
; VI: flat_load_ushort [[HI:v[0-9]+]]
+; VI: flat_load_ushort [[LO:v[0-9]+]]
; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
-; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[MASK]], [[LO]]
; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[MASK]], [[HI]]
-; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
-; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff,
-; VI: v_or_b32
+; VI-DAG: v_and_b32_sdwa [[FABS_HI:v[0-9]+]], [[MASK]], [[LO]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_HI]], [[FABS_LO]]
; VI: flat_store_dword
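+; (v_and_b32_sdwa with dst_sel:WORD_1 writes the masked half directly into
+; the high 16 bits of the destination, so the old lshl/and/or sequence
+; collapses to a single v_or_b32)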
-define void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
+
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
+define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
%fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
store <2 x half> %fabs, <2 x half> addrspace(1)* %out
ret void
}
-; GCN-LABEL: {{^}}fabs_v4f16:
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
+; GCN-LABEL: {{^}}s_fabs_v4f16:
+; CI: s_movk_i32 [[MASK:s[0-9]+]], 0x7fff
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
-; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
-; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
-; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
-; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: flat_store_dwordx2
-define void @fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
+define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
%fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
store <4 x half> %fabs, <4 x half> addrspace(1)* %out
ret void
@@ -72,22 +77,74 @@ define void @fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
; GCN-LABEL: {{^}}fabs_fold_f16:
; GCN: flat_load_ushort [[IN0:v[0-9]+]]
; GCN: flat_load_ushort [[IN1:v[0-9]+]]
+
; CI-DAG: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[IN0]]
-; CI-DAG: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], [[IN1]]
-; CI: v_mul_f32_e64 [[RESULT:v[0-9]+]], |[[CVT1]]|, [[CVT0]]
+; CI-DAG: v_cvt_f32_f16_e64 [[ABS_CVT1:v[0-9]+]], |[[IN1]]|
+; CI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[CVT0]], [[ABS_CVT1]]
; CI: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]]
; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]]
; VI-NOT: and
; VI: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN1]]|, [[IN0]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) {
+define amdgpu_kernel void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) {
%fabs = call half @llvm.fabs.f16(half %in0)
%fmul = fmul half %fabs, %in1
store half %fmul, half addrspace(1)* %out
ret void
}
-declare half @llvm.fabs.f16(half) readnone
-declare <2 x half> @llvm.fabs.v2f16(<2 x half>) readnone
-declare <4 x half> @llvm.fabs.v4f16(<4 x half>) readnone
+; GCN-LABEL: {{^}}v_fabs_v2f16:
+; GCN: flat_load_dword [[VAL:v[0-9]+]]
+; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, [[VAL]]
+define amdgpu_kernel void @v_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+ %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
+ %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
+ store <2 x half> %fabs, <2 x half> addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fabs_free_v2f16:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
+define amdgpu_kernel void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
+ %bc = bitcast i32 %in to <2 x half>
+ %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %bc)
+ store <2 x half> %fabs, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fabs_fold_v2f16:
+; GCN: flat_load_dword [[VAL:v[0-9]+]]
+
+; CI: v_cvt_f32_f16_e32
+; CI: v_cvt_f32_f16_e32
+; CI: v_mul_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
+; CI: v_cvt_f16_f32
+; CI: v_mul_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
+; CI: v_cvt_f16_f32
+
+; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
+; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
+
+; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]]
+; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], v{{[0-9]+$}}
+define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+ %val = load <2 x half>, <2 x half> addrspace(1)* %in
+ %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
+ %fmul = fmul <2 x half> %fabs, %val
+ store <2 x half> %fmul, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+declare half @llvm.fabs.f16(half) #1
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
+declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fabs.f64.ll b/test/CodeGen/AMDGPU/fabs.f64.ll
index f7780b875ff5..998e02f7bdf8 100644
--- a/test/CodeGen/AMDGPU/fabs.f64.ll
+++ b/test/CodeGen/AMDGPU/fabs.f64.ll
@@ -10,7 +10,7 @@ declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone
; FUNC-LABEL: {{^}}v_fabs_f64:
; SI: v_and_b32
; SI: s_endpgm
-define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%tidext = sext i32 %tid to i64
%gep = getelementptr double, double addrspace(1)* %in, i64 %tidext
@@ -24,7 +24,7 @@ define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
; SI: v_and_b32
; SI-NOT: v_and_b32
; SI: s_endpgm
-define void @fabs_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double %in) {
%fabs = call double @llvm.fabs.f64(double %in)
store double %fabs, double addrspace(1)* %out
ret void
@@ -34,7 +34,7 @@ define void @fabs_f64(double addrspace(1)* %out, double %in) {
; SI: v_and_b32
; SI: v_and_b32
; SI: s_endpgm
-define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
%fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
store <2 x double> %fabs, <2 x double> addrspace(1)* %out
ret void
@@ -46,7 +46,7 @@ define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
; SI: v_and_b32
; SI: v_and_b32
; SI: s_endpgm
-define void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
%fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
store <4 x double> %fabs, <4 x double> addrspace(1)* %out
ret void
@@ -57,7 +57,7 @@ define void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
; SI-NOT: and
; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|
; SI: s_endpgm
-define void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
+define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
%fabs = call double @llvm.fabs.f64(double %in0)
%fmul = fmul double %fabs, %in1
store double %fmul, double addrspace(1)* %out
@@ -69,7 +69,7 @@ define void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1)
; SI-NOT: and
; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|
; SI: s_endpgm
-define void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
+define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
%fabs = call double @fabs(double %in0)
%fmul = fmul double %fabs, %in1
store double %fmul, double addrspace(1)* %out
@@ -79,7 +79,7 @@ define void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in
; FUNC-LABEL: {{^}}fabs_free_f64:
; SI: v_and_b32
; SI: s_endpgm
-define void @fabs_free_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @fabs_free_f64(double addrspace(1)* %out, i64 %in) {
%bc= bitcast i64 %in to double
%fabs = call double @llvm.fabs.f64(double %bc)
store double %fabs, double addrspace(1)* %out
@@ -89,7 +89,7 @@ define void @fabs_free_f64(double addrspace(1)* %out, i64 %in) {
; FUNC-LABEL: {{^}}fabs_fn_free_f64:
; SI: v_and_b32
; SI: s_endpgm
-define void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
%bc= bitcast i64 %in to double
%fabs = call double @fabs(double %bc)
store double %fabs, double addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fabs.ll b/test/CodeGen/AMDGPU/fabs.ll
index 98e7f9e3e9ad..ac8fa3e45ef5 100644
--- a/test/CodeGen/AMDGPU/fabs.ll
+++ b/test/CodeGen/AMDGPU/fabs.ll
@@ -13,7 +13,7 @@
; GCN: v_and_b32
-define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @fabs_fn_free(float addrspace(1)* %out, i32 %in) {
%bc= bitcast i32 %in to float
%fabs = call float @fabs(float %bc)
store float %fabs, float addrspace(1)* %out
@@ -26,7 +26,7 @@ define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) {
; GCN: v_and_b32
-define void @fabs_free(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @fabs_free(float addrspace(1)* %out, i32 %in) {
%bc= bitcast i32 %in to float
%fabs = call float @llvm.fabs.f32(float %bc)
store float %fabs, float addrspace(1)* %out
@@ -37,7 +37,7 @@ define void @fabs_free(float addrspace(1)* %out, i32 %in) {
; R600: |{{(PV|T[0-9])\.[XYZW]}}|
; GCN: v_and_b32
-define void @fabs_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fabs_f32(float addrspace(1)* %out, float %in) {
%fabs = call float @llvm.fabs.f32(float %in)
store float %fabs, float addrspace(1)* %out
ret void
@@ -49,7 +49,7 @@ define void @fabs_f32(float addrspace(1)* %out, float %in) {
; GCN: v_and_b32
; GCN: v_and_b32
-define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
store <2 x float> %fabs, <2 x float> addrspace(1)* %out
ret void
@@ -65,7 +65,7 @@ define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
; GCN: v_and_b32
; GCN: v_and_b32
; GCN: v_and_b32
-define void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
%fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
store <4 x float> %fabs, <4 x float> addrspace(1)* %out
ret void
@@ -76,7 +76,7 @@ define void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; GCN-NOT: and
; GCN: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |[[ABS_VALUE]]|
-define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
+define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
%fabs = call float @fabs(float %in0)
%fmul = fmul float %fabs, %in1
store float %fmul, float addrspace(1)* %out
@@ -88,7 +88,7 @@ define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; GCN-NOT: and
; GCN: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |[[ABS_VALUE]]|
-define void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) {
+define amdgpu_kernel void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) {
%fabs = call float @llvm.fabs.f32(float %in0)
%fmul = fmul float %fabs, %in1
store float %fmul, float addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
index b74bce76f79c..9edf55cbc69f 100644
--- a/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
+++ b/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
@@ -28,7 +28,7 @@
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
-define void @fast_add_fmuladd_fmul() #0 {
+define amdgpu_kernel void @fast_add_fmuladd_fmul() #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -55,7 +55,7 @@ define void @fast_add_fmuladd_fmul() #0 {
; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], -[[Z]]
; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]]
; GCN-FASTFMA: buffer_store_dword [[FMA1]]
-define void @fast_sub_fmuladd_fmul() #0 {
+define amdgpu_kernel void @fast_sub_fmuladd_fmul() #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -87,7 +87,7 @@ define void @fast_sub_fmuladd_fmul() #0 {
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
-define void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
+define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -120,7 +120,7 @@ define void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
-define void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
+define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -145,7 +145,7 @@ define void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
-define void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
+define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -170,7 +170,7 @@ define void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
-define void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 {
+define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -205,7 +205,7 @@ define void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 {
; GCN: buffer_store_dword [[MUL]]
; GCN: buffer_store_dword [[MAD]]
-define void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
+define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -241,7 +241,7 @@ define void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]]
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_subrev_f32_e32
-define void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 {
+define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
diff --git a/test/CodeGen/AMDGPU/fadd.f16.ll b/test/CodeGen/AMDGPU/fadd.f16.ll
index 9ca077564e2b..f76ecf58d905 100644
--- a/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -11,7 +11,7 @@
; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @fadd_f16(
+define amdgpu_kernel void @fadd_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -25,14 +25,13 @@ entry:
; GCN-LABEL: {{^}}fadd_f16_imm_a
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x3c00{{$}}
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
+; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 1.0, v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
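+; (the f16 constants 1.0 (0x3c00) and 2.0 (0x4000) fold as inline
+; immediates, so the separate literal moves removed here are unnecessary)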
-define void @fadd_f16_imm_a(
+define amdgpu_kernel void @fadd_f16_imm_a(
half addrspace(1)* %r,
half addrspace(1)* %b) {
entry:
@@ -44,14 +43,13 @@ entry:
; GCN-LABEL: {{^}}fadd_f16_imm_b
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4000{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
+; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 2.0, v[[A_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @fadd_f16_imm_b(
+define amdgpu_kernel void @fadd_f16_imm_b(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -61,27 +59,31 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fadd_v2f16
+; GCN-LABEL: {{^}}fadd_v2f16:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_add_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @fadd_v2f16(
+define amdgpu_kernel void @fadd_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
@@ -93,25 +95,26 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fadd_v2f16_imm_a
+; GCN-LABEL: {{^}}fadd_v2f16_imm_a:
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], 0x3c00{{$}}
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], 0x4000{{$}}
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
+; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
+; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
-; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
+; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
+; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @fadd_v2f16_imm_a(
+define amdgpu_kernel void @fadd_v2f16_imm_a(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %b) {
entry:
@@ -121,25 +124,26 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fadd_v2f16_imm_b
+; GCN-LABEL: {{^}}fadd_v2f16_imm_b:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], 0x4000{{$}}
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], 0x3c00{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
+; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 2.0, v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 2.0, v[[A_V2_F16]]
-; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 1.0, v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[A_F16_1]]
+; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[A_V2_F16]]
+; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_1]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @fadd_v2f16_imm_b(
+define amdgpu_kernel void @fadd_v2f16_imm_b(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/fadd.ll b/test/CodeGen/AMDGPU/fadd.ll
index 0f683f7bfa23..621a0de281db 100644
--- a/test/CodeGen/AMDGPU/fadd.ll
+++ b/test/CodeGen/AMDGPU/fadd.ll
@@ -5,7 +5,7 @@
; FUNC-LABEL: {{^}}fadd_f32:
; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
; SI: v_add_f32
-define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) {
+define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float %a, float %b) #0 {
%add = fadd float %a, %b
store float %add, float addrspace(1)* %out, align 4
ret void
@@ -16,7 +16,7 @@ define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) {
; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
; SI: v_add_f32
; SI: v_add_f32
-define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
%add = fadd <2 x float> %a, %b
store <2 x float> %add, <2 x float> addrspace(1)* %out, align 8
ret void
@@ -31,7 +31,7 @@ define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x flo
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
-define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
%b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16
%b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16
@@ -57,8 +57,19 @@ define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
-define void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) {
+define amdgpu_kernel void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) #0 {
%add = fadd <8 x float> %a, %b
store <8 x float> %add, <8 x float> addrspace(1)* %out, align 32
ret void
}
+
+; FUNC-LABEL: {{^}}fadd_0_nsz_attr_f32:
+; SI-NOT: v_add_f32
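+; (folding x + 0.0 to x is only sound without signed zeros, since
+; -0.0 + 0.0 is +0.0; the "no-signed-zeros-fp-math" attribute grants that)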
+define amdgpu_kernel void @fadd_0_nsz_attr_f32(float addrspace(1)* %out, float %a) #1 {
+ %add = fadd float %a, 0.0
+ store float %add, float addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" }
\ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/fadd64.ll b/test/CodeGen/AMDGPU/fadd64.ll
index 6f0c9de8ebaf..7eb7747de215 100644
--- a/test/CodeGen/AMDGPU/fadd64.ll
+++ b/test/CodeGen/AMDGPU/fadd64.ll
@@ -3,7 +3,7 @@
; CHECK-LABEL: {{^}}v_fadd_f64:
; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}
-define void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
%r1 = load double, double addrspace(1)* %in2
@@ -14,7 +14,7 @@ define void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
; CHECK-LABEL: {{^}}s_fadd_f64:
; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) {
+define amdgpu_kernel void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) {
%r2 = fadd double %r0, %r1
store double %r2, double addrspace(1)* %out
ret void
@@ -24,7 +24,7 @@ define void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) {
; CHECK: v_add_f64
; CHECK: v_add_f64
; CHECK: _store_dwordx4
-define void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
<2 x double> addrspace(1)* %in2) {
%r0 = load <2 x double>, <2 x double> addrspace(1)* %in1
%r1 = load <2 x double>, <2 x double> addrspace(1)* %in2
@@ -37,7 +37,7 @@ define void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspac
; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
; CHECK: _store_dwordx4
-define void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %r0, <2 x double> %r1) {
+define amdgpu_kernel void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %r0, <2 x double> %r1) {
%r2 = fadd <2 x double> %r0, %r1
store <2 x double> %r2, <2 x double> addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index ad3992f4cd03..f2686a5582dc 100644
--- a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -1,11 +1,15 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+declare half @llvm.fabs.f16(half) #0
declare half @llvm.canonicalize.f16(half) #0
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0
+declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
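+
+; On GCN, canonicalize lowers to a multiply by 1.0 (the v_mul/v_pk_mul checks
+; below), which quiets signaling NaNs to the default quiet NaN (0x7e00 for
+; f16) and respects the current denormal mode.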
; GCN-LABEL: {{^}}v_test_canonicalize_var_f16:
; GCN: v_mul_f16_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
; GCN: buffer_store_short [[REG]]
-define void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 {
%val = load half, half addrspace(1)* %out
%canonicalized = call half @llvm.canonicalize.f16(half %val)
store half %canonicalized, half addrspace(1)* %out
@@ -15,17 +19,51 @@ define void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 {
; GCN-LABEL: {{^}}s_test_canonicalize_var_f16:
; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}}
; GCN: buffer_store_short [[REG]]
-define void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 {
+define amdgpu_kernel void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 {
%val = bitcast i16 %val.arg to half
%canonicalized = call half @llvm.canonicalize.f16(half %val)
store half %canonicalized, half addrspace(1)* %out
ret void
}
+; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f16:
+; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}|
+; GCN: buffer_store_short [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 {
+ %val = load half, half addrspace(1)* %out
+ %val.fabs = call half @llvm.fabs.f16(half %val)
+ %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs)
+ store half %canonicalized, half addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f16:
+; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, -|{{v[0-9]+}}|
+; GCN: buffer_store_short [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #1 {
+ %val = load half, half addrspace(1)* %out
+ %val.fabs = call half @llvm.fabs.f16(half %val)
+ %val.fabs.fneg = fsub half -0.0, %val.fabs
+ %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs.fneg)
+ store half %canonicalized, half addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f16:
+; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, -{{v[0-9]+}}
+; GCN: buffer_store_short [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %out) #1 {
+ %val = load half, half addrspace(1)* %out
+ %val.fneg = fsub half -0.0, %val
+ %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg)
+ store half %canonicalized, half addrspace(1)* %out
+ ret void
+}
+
; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half 0.0)
store half %canonicalized, half addrspace(1)* %out
ret void
@@ -34,7 +72,7 @@ define void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 {
; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half -0.0)
store half %canonicalized, half addrspace(1)* %out
ret void
@@ -43,7 +81,7 @@ define void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 {
; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}}
; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half 1.0)
store half %canonicalized, half addrspace(1)* %out
ret void
@@ -52,7 +90,7 @@ define void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 {
; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}}
; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half -1.0)
store half %canonicalized, half addrspace(1)* %out
ret void
@@ -61,16 +99,16 @@ define void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 {
; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c00{{$}}
; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half 16.0)
store half %canonicalized, half addrspace(1)* %out
ret void
}
-; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f16:
-; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal0_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}}
; GCN: buffer_store_short [[REG]]
-define void @test_no_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
store half %canonicalized, half addrspace(1)* %out
ret void
@@ -79,16 +117,16 @@ define void @test_no_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)
; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}}
; GCN: buffer_store_short [[REG]]
-define void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 {
%canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
store half %canonicalized, half addrspace(1)* %out
ret void
}
-; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f16:
-; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal1_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}}
; GCN: buffer_store_short [[REG]]
-define void @test_no_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
store half %canonicalized, half addrspace(1)* %out
ret void
@@ -97,7 +135,7 @@ define void @test_no_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)
; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}}
; GCN: buffer_store_short [[REG]]
-define void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 {
%canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
store half %canonicalized, half addrspace(1)* %out
ret void
@@ -106,7 +144,7 @@ define void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c00{{$}}
; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00)
store half %canonicalized, half addrspace(1)* %out
ret void
@@ -115,7 +153,7 @@ define void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 {
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half))
store half %canonicalized, half addrspace(1)* %out
ret void
@@ -124,7 +162,7 @@ define void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out)
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half))
store half %canonicalized, half addrspace(1)* %out
ret void
@@ -133,7 +171,7 @@ define void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out)
; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01)
store half %canonicalized, half addrspace(1)* %out
ret void
@@ -142,7 +180,7 @@ define void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1
; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF)
store half %canonicalized, half addrspace(1)* %out
ret void
@@ -151,7 +189,7 @@ define void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1
; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF)
store half %canonicalized, half addrspace(1)* %out
ret void
@@ -160,13 +198,244 @@ define void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1
; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01)
store half %canonicalized, half addrspace(1)* %out
ret void
}
+; GCN-LABEL: {{^}}v_test_canonicalize_var_v2f16:
+; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, {{v[0-9]+}}
+; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, {{v[0-9]+}}
+; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI-NOT: v_and_b32
+
+; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}}
+; GFX9: buffer_store_dword [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
+ %val = load <2 x half>, <2 x half> addrspace(1)* %out
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Fold modifier
+; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_v2f16:
+; VI-DAG: v_bfe_u32
+; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, v{{[0-9]+}}
+; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}}
+; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}}
+; VI-NOT: 0xffff
+; VI: v_or_b32
+
+; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
+; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]]{{$}}
+; GCN: buffer_store_dword
+define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
+ %val = load <2 x half>, <2 x half> addrspace(1)* %out
+ %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16:
+; VI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
+; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}}
+; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}}
+; VI: v_or_b32
+
+; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
+; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]] neg_lo:[0,1] neg_hi:[0,1]{{$}}
+; GCN: buffer_store_dword
+define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
+ %val = load <2 x half>, <2 x half> addrspace(1)* %out
+ %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
+ %val.fabs.fneg = fsub <2 x half> <half -0.0, half -0.0>, %val.fabs
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs.fneg)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Fold modifier
+; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16:
+; VI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}}
+; VI-DAG: v_lshrrev_b32_e32 [[FNEG_HI:v[0-9]+]], 16, [[FNEG]]
+; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, [[FNEG_HI]]
+; VI-DAG: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, [[FNEG]]
+; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI-NOT: 0xffff
+
+; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}}
+; GFX9: buffer_store_dword [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
+ %val = load <2 x half>, <2 x half> addrspace(1)* %out
+ %fneg.val = fsub <2 x half> <half -0.0, half -0.0>, %val
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %fneg.val)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_canonicalize_var_v2f16:
+; VI: v_mul_f16_e64 [[REG0:v[0-9]+]], 1.0, {{s[0-9]+}}
+; VI-DAG: v_mul_f16_e64 [[REG1:v[0-9]+]], 1.0, {{s[0-9]+}}
+; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI-NOT: v_and_b32
+
+; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{s[0-9]+$}}
+; GFX9: buffer_store_dword [[REG]]
+define amdgpu_kernel void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 {
+ %val = bitcast i32 %val.arg to <2 x half>
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_p0_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace(1)* %out) #1 {
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_n0_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(1)* %out) #1 {
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -0.0, half -0.0>)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_p1_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(1)* %out) #1 {
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 1.0, half 1.0>)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_n1_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(1)* %out) #1 {
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -1.0, half -1.0>)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_literal_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c004c00{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(<2 x half> addrspace(1)* %out) #1 {
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 16.0, half 16.0>)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #1 {
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #3 {
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #1 {
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #3 {
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c007c00{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspace(1)* %out) #1 {
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C00, half 0xH7C00>)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x half> addrspace(1)* %out) #1 {
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>))
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x half> addrspace(1)* %out) #1 {
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half bitcast (i16 -2 to half), half bitcast (i16 -2 to half)>)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C01, half 0xH7C01>)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7DFF, half 0xH7DFF>)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFDFF, half 0xHFDFF>)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFC01, half 0xHFC01>)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind "target-features"="-flat-for-global" }
-attributes #2 = { nounwind "target-features"="-flat-for-global,-fp16-denormals,-fp16-denormals" }
-attributes #3 = { nounwind "target-features"="-flat-for-global,+fp16-denormals,+fp64-denormals" }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind "target-features"="-fp64-fp16-denormals" }
+attributes #3 = { nounwind "target-features"="+fp64-fp16-denormals" }
diff --git a/test/CodeGen/AMDGPU/fcanonicalize.ll b/test/CodeGen/AMDGPU/fcanonicalize.ll
index 981d88dfe94e..8c385f40b1c5 100644
--- a/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -1,12 +1,14 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+declare float @llvm.fabs.f32(float) #0
declare float @llvm.canonicalize.f32(float) #0
+declare double @llvm.fabs.f64(double) #0
declare double @llvm.canonicalize.f64(double) #0
; GCN-LABEL: {{^}}v_test_canonicalize_var_f32:
; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
; GCN: buffer_store_dword [[REG]]
-define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 {
%val = load float, float addrspace(1)* %out
%canonicalized = call float @llvm.canonicalize.f32(float %val)
store float %canonicalized, float addrspace(1)* %out
@@ -16,16 +18,50 @@ define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 {
; GCN-LABEL: {{^}}s_test_canonicalize_var_f32:
; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}}
; GCN: buffer_store_dword [[REG]]
-define void @s_test_canonicalize_var_f32(float addrspace(1)* %out, float %val) #1 {
+define amdgpu_kernel void @s_test_canonicalize_var_f32(float addrspace(1)* %out, float %val) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float %val)
store float %canonicalized, float addrspace(1)* %out
ret void
}
+; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f32:
+; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}|
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(float addrspace(1)* %out) #1 {
+ %val = load float, float addrspace(1)* %out
+ %val.fabs = call float @llvm.fabs.f32(float %val)
+ %canonicalized = call float @llvm.canonicalize.f32(float %val.fabs)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f32:
+; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, -|{{v[0-9]+}}|
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace(1)* %out) #1 {
+ %val = load float, float addrspace(1)* %out
+ %val.fabs = call float @llvm.fabs.f32(float %val)
+ %val.fabs.fneg = fsub float -0.0, %val.fabs
+ %canonicalized = call float @llvm.canonicalize.f32(float %val.fabs.fneg)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f32:
+; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, -{{v[0-9]+}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(float addrspace(1)* %out) #1 {
+ %val = load float, float addrspace(1)* %out
+ %val.fneg = fsub float -0.0, %val
+ %canonicalized = call float @llvm.canonicalize.f32(float %val.fneg)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
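+; Informal note on the three variants above: canonicalize lowers to a
+; multiply by 1.0, and the fabs/fneg wrappers fold into VOP3 source
+; modifiers instead of separate instructions, hence the expected
+; v_mul_f32_e64 with |v|, -|v| and -v operands.
+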
; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float 0.0)
store float %canonicalized, float addrspace(1)* %out
ret void
@@ -34,7 +70,7 @@ define void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 {
; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f32:
; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float -0.0)
store float %canonicalized, float addrspace(1)* %out
ret void
@@ -43,7 +79,7 @@ define void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 {
; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float 1.0)
store float %canonicalized, float addrspace(1)* %out
ret void
@@ -52,7 +88,7 @@ define void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 {
; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float -1.0)
store float %canonicalized, float addrspace(1)* %out
ret void
@@ -61,7 +97,7 @@ define void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 {
; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x41800000{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float 16.0)
store float %canonicalized, float addrspace(1)* %out
ret void
@@ -70,7 +106,7 @@ define void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 {
; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
store float %canonicalized, float addrspace(1)* %out
ret void
@@ -79,7 +115,7 @@ define void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1
; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fffff{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #3 {
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
store float %canonicalized, float addrspace(1)* %out
ret void
@@ -88,7 +124,7 @@ define void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)*
; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
store float %canonicalized, float addrspace(1)* %out
ret void
@@ -97,7 +133,7 @@ define void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1
; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x807fffff{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #3 {
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
store float %canonicalized, float addrspace(1)* %out
ret void
@@ -106,7 +142,7 @@ define void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)*
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
store float %canonicalized, float addrspace(1)* %out
ret void
@@ -115,7 +151,7 @@ define void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 {
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -1 to float))
store float %canonicalized, float addrspace(1)* %out
ret void
@@ -124,7 +160,7 @@ define void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -2 to float))
store float %canonicalized, float addrspace(1)* %out
ret void
@@ -133,7 +169,7 @@ define void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2139095041 to float))
store float %canonicalized, float addrspace(1)* %out
ret void
@@ -142,7 +178,7 @@ define void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1
; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2143289343 to float))
store float %canonicalized, float addrspace(1)* %out
ret void
@@ -151,7 +187,7 @@ define void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1
; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4286578689 to float))
store float %canonicalized, float addrspace(1)* %out
ret void
@@ -160,7 +196,7 @@ define void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1
; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4290772991 to float))
store float %canonicalized, float addrspace(1)* %out
ret void
@@ -169,7 +205,7 @@ define void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1
; GCN-LABEL: {{^}}v_test_canonicalize_var_f64:
; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{v\[[0-9]+:[0-9]+\]}}
; GCN: buffer_store_dwordx2 [[REG]]
-define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 {
%val = load double, double addrspace(1)* %out
%canonicalized = call double @llvm.canonicalize.f64(double %val)
store double %canonicalized, double addrspace(1)* %out
@@ -179,17 +215,51 @@ define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 {
; GCN-LABEL: {{^}}s_test_canonicalize_var_f64:
; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{s\[[0-9]+:[0-9]+\]}}
; GCN: buffer_store_dwordx2 [[REG]]
-define void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 {
+define amdgpu_kernel void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double %val)
store double %canonicalized, double addrspace(1)* %out
ret void
}
+; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f64:
+; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, |{{v\[[0-9]+:[0-9]+\]}}|
+; GCN: buffer_store_dwordx2 [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* %out) #1 {
+ %val = load double, double addrspace(1)* %out
+ %val.fabs = call double @llvm.fabs.f64(double %val)
+ %canonicalized = call double @llvm.canonicalize.f64(double %val.fabs)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f64:
+; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, -|{{v\[[0-9]+:[0-9]+\]}}|
+; GCN: buffer_store_dwordx2 [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(double addrspace(1)* %out) #1 {
+ %val = load double, double addrspace(1)* %out
+ %val.fabs = call double @llvm.fabs.f64(double %val)
+ %val.fabs.fneg = fsub double -0.0, %val.fabs
+ %canonicalized = call double @llvm.canonicalize.f64(double %val.fabs.fneg)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f64:
+; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, -{{v\[[0-9]+:[0-9]+\]}}
+; GCN: buffer_store_dwordx2 [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* %out) #1 {
+ %val = load double, double addrspace(1)* %out
+ %val.fneg = fsub double -0.0, %val
+ %canonicalized = call double @llvm.canonicalize.f64(double %val.fneg)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f64:
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double 0.0)
store double %canonicalized, double addrspace(1)* %out
ret void
@@ -199,7 +269,7 @@ define void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 {
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN-DAG: v_bfrev_b32_e32 v[[HI:[0-9]+]], 1{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double -0.0)
store double %canonicalized, double addrspace(1)* %out
ret void
@@ -209,7 +279,7 @@ define void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 {
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x3ff00000{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double 1.0)
store double %canonicalized, double addrspace(1)* %out
ret void
@@ -219,7 +289,7 @@ define void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 {
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xbff00000{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double -1.0)
store double %canonicalized, double addrspace(1)* %out
ret void
@@ -229,7 +299,7 @@ define void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 {
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x40300000{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double 16.0)
store double %canonicalized, double addrspace(1)* %out
ret void
@@ -239,7 +309,7 @@ define void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 {
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #2 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #2 {
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
store double %canonicalized, double addrspace(1)* %out
ret void
@@ -249,7 +319,7 @@ define void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xfffff{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #3 {
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
store double %canonicalized, double addrspace(1)* %out
ret void
@@ -259,7 +329,7 @@ define void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)*
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #2 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #2 {
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
store double %canonicalized, double addrspace(1)* %out
ret void
@@ -269,7 +339,7 @@ define void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x800fffff{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #3 {
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
store double %canonicalized, double addrspace(1)* %out
ret void
@@ -279,7 +349,7 @@ define void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)*
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000)
store double %canonicalized, double addrspace(1)* %out
ret void
@@ -289,7 +359,7 @@ define void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 {
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double))
store double %canonicalized, double addrspace(1)* %out
ret void
@@ -299,7 +369,7 @@ define void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %ou
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double))
store double %canonicalized, double addrspace(1)* %out
ret void
@@ -309,7 +379,7 @@ define void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %ou
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double))
store double %canonicalized, double addrspace(1)* %out
ret void
@@ -319,7 +389,7 @@ define void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9223372036854775807 to double))
store double %canonicalized, double addrspace(1)* %out
ret void
@@ -329,7 +399,7 @@ define void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18442240474082181121 to double))
store double %canonicalized, double addrspace(1)* %out
ret void
@@ -339,7 +409,7 @@ define void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18446744073709551615 to double))
store double %canonicalized, double addrspace(1)* %out
ret void
@@ -347,5 +417,5 @@ define void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
-attributes #2 = { nounwind "target-features"="-fp32-denormals,-fp64-denormals" }
-attributes #3 = { nounwind "target-features"="+fp32-denormals,+fp64-denormals" }
+attributes #2 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" }
+attributes #3 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" }
diff --git a/test/CodeGen/AMDGPU/fceil.ll b/test/CodeGen/AMDGPU/fceil.ll
index efdda78f852b..0b913fda8580 100644
--- a/test/CodeGen/AMDGPU/fceil.ll
+++ b/test/CodeGen/AMDGPU/fceil.ll
@@ -13,7 +13,7 @@ declare <16 x float> @llvm.ceil.v16f32(<16 x float>) nounwind readnone
; SI: v_ceil_f32_e32
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
; EG: CEIL {{\*? *}}[[RESULT]]
-define void @fceil_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @fceil_f32(float addrspace(1)* %out, float %x) {
%y = call float @llvm.ceil.f32(float %x) nounwind readnone
store float %y, float addrspace(1)* %out
ret void
@@ -25,7 +25,7 @@ define void @fceil_f32(float addrspace(1)* %out, float %x) {
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
; EG: CEIL {{\*? *}}[[RESULT]]
; EG: CEIL {{\*? *}}[[RESULT]]
-define void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
+define amdgpu_kernel void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
%y = call <2 x float> @llvm.ceil.v2f32(<2 x float> %x) nounwind readnone
store <2 x float> %y, <2 x float> addrspace(1)* %out
ret void
@@ -41,7 +41,7 @@ define void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
-define void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
+define amdgpu_kernel void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
%y = call <3 x float> @llvm.ceil.v3f32(<3 x float> %x) nounwind readnone
store <3 x float> %y, <3 x float> addrspace(1)* %out
ret void
@@ -57,7 +57,7 @@ define void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
; EG: CEIL {{\*? *}}[[RESULT]]
; EG: CEIL {{\*? *}}[[RESULT]]
; EG: CEIL {{\*? *}}[[RESULT]]
-define void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
+define amdgpu_kernel void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
%y = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) nounwind readnone
store <4 x float> %y, <4 x float> addrspace(1)* %out
ret void
@@ -82,7 +82,7 @@ define void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
-define void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
+define amdgpu_kernel void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
%y = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x) nounwind readnone
store <8 x float> %y, <8 x float> addrspace(1)* %out
ret void
@@ -125,7 +125,7 @@ define void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
-define void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
+define amdgpu_kernel void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
%y = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x) nounwind readnone
store <16 x float> %y, <16 x float> addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/fceil64.ll b/test/CodeGen/AMDGPU/fceil64.ll
index 98448db5dd24..61572a855620 100644
--- a/test/CodeGen/AMDGPU/fceil64.ll
+++ b/test/CodeGen/AMDGPU/fceil64.ll
@@ -31,7 +31,7 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
; SI: v_cndmask_b32
; SI: v_add_f64
; SI: s_endpgm
-define void @fceil_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @fceil_f64(double addrspace(1)* %out, double %x) {
%y = call double @llvm.ceil.f64(double %x) nounwind readnone
store double %y, double addrspace(1)* %out
ret void
@@ -40,7 +40,7 @@ define void @fceil_f64(double addrspace(1)* %out, double %x) {
; FUNC-LABEL: {{^}}fceil_v2f64:
; CI: v_ceil_f64_e32
; CI: v_ceil_f64_e32
-define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+define amdgpu_kernel void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
%y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone
store <2 x double> %y, <2 x double> addrspace(1)* %out
ret void
@@ -50,7 +50,7 @@ define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
; FIXME-CI: v_ceil_f64_e32
; FIXME-CI: v_ceil_f64_e32
; FIXME-CI: v_ceil_f64_e32
-; define void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+; define amdgpu_kernel void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
; %y = call <3 x double> @llvm.ceil.v3f64(<3 x double> %x) nounwind readnone
; store <3 x double> %y, <3 x double> addrspace(1)* %out
; ret void
@@ -61,7 +61,7 @@ define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
; CI: v_ceil_f64_e32
; CI: v_ceil_f64_e32
; CI: v_ceil_f64_e32
-define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+define amdgpu_kernel void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
%y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone
store <4 x double> %y, <4 x double> addrspace(1)* %out
ret void
@@ -76,7 +76,7 @@ define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
; CI: v_ceil_f64_e32
; CI: v_ceil_f64_e32
; CI: v_ceil_f64_e32
-define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+define amdgpu_kernel void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
%y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone
store <8 x double> %y, <8 x double> addrspace(1)* %out
ret void
@@ -99,7 +99,7 @@ define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
; CI: v_ceil_f64_e32
; CI: v_ceil_f64_e32
; CI: v_ceil_f64_e32
-define void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+define amdgpu_kernel void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
%y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone
store <16 x double> %y, <16 x double> addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/fcmp-cnd.ll b/test/CodeGen/AMDGPU/fcmp-cnd.ll
index 530274f920f0..7f8be804309e 100644
--- a/test/CodeGen/AMDGPU/fcmp-cnd.ll
+++ b/test/CodeGen/AMDGPU/fcmp-cnd.ll
@@ -4,7 +4,7 @@
;registers and literal.x depending on what the optimizer does.
;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
entry:
%0 = load float, float addrspace(1)* %in
%cmp = fcmp oeq float %0, 0.000000e+00
diff --git a/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll b/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll
index c402805feb39..2a848e80b81b 100644
--- a/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll
+++ b/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll
@@ -6,7 +6,7 @@
; CHECK: SET{{[A-Z]+}}_DX10
-define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
entry:
%0 = load float, float addrspace(1)* %in
%cmp = fcmp oeq float %0, 0.000000e+00
diff --git a/test/CodeGen/AMDGPU/fcmp.f16.ll b/test/CodeGen/AMDGPU/fcmp.f16.ll
index a62726f7f068..7916226462f7 100644
--- a/test/CodeGen/AMDGPU/fcmp.f16.ll
+++ b/test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -11,7 +11,7 @@
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
-define void @fcmp_f16_lt(
+define amdgpu_kernel void @fcmp_f16_lt(
i32 addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -28,16 +28,16 @@ entry:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
+; SI: v_cvt_f32_f16_e64 v[[A_F32:[0-9]+]], |v[[A_F16]]|
+; SI: v_cvt_f32_f16_e64 v[[B_F32:[0-9]+]], |v[[B_F16]]|
-; SI: v_cmp_lt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, |v[[A_F32]]|, |v[[B_F32]]|
+; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_lt_f16_e64 s{{\[[0-9]+:[0-9]+\]}}, |v[[A_F16]]|, |v[[B_F16]]|
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
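+; (Informal note: with this change the fabs folds into the SI f16->f32
+; converts as the |v[[A_F16]]| / |v[[B_F16]]| source modifiers, so the
+; compare no longer needs e64 modifiers and can use the VOP2 vcc form.)
+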
-define void @fcmp_f16_lt_abs(
+define amdgpu_kernel void @fcmp_f16_lt_abs(
i32 addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -62,7 +62,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
-define void @fcmp_f16_eq(
+define amdgpu_kernel void @fcmp_f16_eq(
i32 addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -85,7 +85,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
-define void @fcmp_f16_le(
+define amdgpu_kernel void @fcmp_f16_le(
i32 addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -108,7 +108,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
-define void @fcmp_f16_gt(
+define amdgpu_kernel void @fcmp_f16_gt(
i32 addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -131,7 +131,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
-define void @fcmp_f16_lg(
+define amdgpu_kernel void @fcmp_f16_lg(
i32 addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -154,7 +154,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
-define void @fcmp_f16_ge(
+define amdgpu_kernel void @fcmp_f16_ge(
i32 addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -177,7 +177,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
-define void @fcmp_f16_o(
+define amdgpu_kernel void @fcmp_f16_o(
i32 addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -200,7 +200,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
-define void @fcmp_f16_u(
+define amdgpu_kernel void @fcmp_f16_u(
i32 addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -223,7 +223,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
-define void @fcmp_f16_nge(
+define amdgpu_kernel void @fcmp_f16_nge(
i32 addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -246,7 +246,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
-define void @fcmp_f16_nlg(
+define amdgpu_kernel void @fcmp_f16_nlg(
i32 addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -269,7 +269,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
-define void @fcmp_f16_ngt(
+define amdgpu_kernel void @fcmp_f16_ngt(
i32 addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -292,7 +292,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
-define void @fcmp_f16_nle(
+define amdgpu_kernel void @fcmp_f16_nle(
i32 addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -315,7 +315,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
-define void @fcmp_f16_neq(
+define amdgpu_kernel void @fcmp_f16_neq(
i32 addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -338,7 +338,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
-define void @fcmp_f16_nlt(
+define amdgpu_kernel void @fcmp_f16_nlt(
i32 addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -368,7 +368,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
; GCN: s_endpgm
-define void @fcmp_v2f16_lt(
+define amdgpu_kernel void @fcmp_v2f16_lt(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
@@ -398,7 +398,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
; GCN: s_endpgm
-define void @fcmp_v2f16_eq(
+define amdgpu_kernel void @fcmp_v2f16_eq(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
@@ -428,7 +428,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
; GCN: s_endpgm
-define void @fcmp_v2f16_le(
+define amdgpu_kernel void @fcmp_v2f16_le(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
@@ -458,7 +458,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
; GCN: s_endpgm
-define void @fcmp_v2f16_gt(
+define amdgpu_kernel void @fcmp_v2f16_gt(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
@@ -488,7 +488,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
; GCN: s_endpgm
-define void @fcmp_v2f16_lg(
+define amdgpu_kernel void @fcmp_v2f16_lg(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
@@ -518,7 +518,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
; GCN: s_endpgm
-define void @fcmp_v2f16_ge(
+define amdgpu_kernel void @fcmp_v2f16_ge(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
@@ -548,7 +548,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
; GCN: s_endpgm
-define void @fcmp_v2f16_o(
+define amdgpu_kernel void @fcmp_v2f16_o(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
@@ -578,7 +578,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
; GCN: s_endpgm
-define void @fcmp_v2f16_u(
+define amdgpu_kernel void @fcmp_v2f16_u(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
@@ -608,7 +608,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
; GCN: s_endpgm
-define void @fcmp_v2f16_nge(
+define amdgpu_kernel void @fcmp_v2f16_nge(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
@@ -638,7 +638,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
; GCN: s_endpgm
-define void @fcmp_v2f16_nlg(
+define amdgpu_kernel void @fcmp_v2f16_nlg(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
@@ -668,7 +668,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
; GCN: s_endpgm
-define void @fcmp_v2f16_ngt(
+define amdgpu_kernel void @fcmp_v2f16_ngt(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
@@ -698,7 +698,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
; GCN: s_endpgm
-define void @fcmp_v2f16_nle(
+define amdgpu_kernel void @fcmp_v2f16_nle(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
@@ -728,7 +728,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
; GCN: s_endpgm
-define void @fcmp_v2f16_neq(
+define amdgpu_kernel void @fcmp_v2f16_neq(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
@@ -758,7 +758,7 @@ entry:
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
; GCN: s_endpgm
-define void @fcmp_v2f16_nlt(
+define amdgpu_kernel void @fcmp_v2f16_nlt(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
diff --git a/test/CodeGen/AMDGPU/fcmp.ll b/test/CodeGen/AMDGPU/fcmp.ll
index 97d954fcc3c2..b548670edb06 100644
--- a/test/CodeGen/AMDGPU/fcmp.ll
+++ b/test/CodeGen/AMDGPU/fcmp.ll
@@ -3,7 +3,7 @@
; CHECK: {{^}}fcmp_sext:
; CHECK: SETE_DX10 T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) {
entry:
%0 = load float, float addrspace(1)* %in
%arrayidx1 = getelementptr inbounds float, float addrspace(1)* %in, i32 1
@@ -22,7 +22,7 @@ entry:
; CHECK: SET{{[N]*}}E_DX10 * T{{[0-9]+\.[XYZW],}}
; CHECK-NEXT: {{[0-9]+\(5.0}}
-define void @fcmp_br(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_br(i32 addrspace(1)* %out, float %in) {
entry:
%0 = fcmp oeq float %in, 5.0
br i1 %0, label %IF, label %ENDIF
diff --git a/test/CodeGen/AMDGPU/fcmp64.ll b/test/CodeGen/AMDGPU/fcmp64.ll
index acce82fdfe53..b9e1921d4c45 100644
--- a/test/CodeGen/AMDGPU/fcmp64.ll
+++ b/test/CodeGen/AMDGPU/fcmp64.ll
@@ -3,7 +3,7 @@
; CHECK-LABEL: {{^}}flt_f64:
; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
%r1 = load double, double addrspace(1)* %in2
@@ -15,7 +15,7 @@ define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
; CHECK-LABEL: {{^}}fle_f64:
; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
%r1 = load double, double addrspace(1)* %in2
@@ -27,7 +27,7 @@ define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
; CHECK-LABEL: {{^}}fgt_f64:
; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
%r1 = load double, double addrspace(1)* %in2
@@ -39,7 +39,7 @@ define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
; CHECK-LABEL: {{^}}fge_f64:
; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
%r1 = load double, double addrspace(1)* %in2
@@ -51,7 +51,7 @@ define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
; CHECK-LABEL: {{^}}fne_f64:
; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
%r1 = load double, double addrspace(1)* %in2
@@ -63,7 +63,7 @@ define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
; CHECK-LABEL: {{^}}feq_f64:
; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
%r1 = load double, double addrspace(1)* %in2
diff --git a/test/CodeGen/AMDGPU/fconst64.ll b/test/CodeGen/AMDGPU/fconst64.ll
index 89af37545c99..125597796245 100644
--- a/test/CodeGen/AMDGPU/fconst64.ll
+++ b/test/CodeGen/AMDGPU/fconst64.ll
@@ -5,7 +5,7 @@
; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0x40140000
; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0
-define void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
%r1 = load double, double addrspace(1)* %in
%r2 = fadd double %r1, 5.000000e+00
store double %r2, double addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fcopysign.f16.ll b/test/CodeGen/AMDGPU/fcopysign.f16.ll
new file mode 100644
index 000000000000..4e2bf765cd95
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -0,0 +1,264 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
+
+declare half @llvm.copysign.f16(half, half)
+declare float @llvm.copysign.f32(float, float)
+declare double @llvm.copysign.f64(double, double)
+declare <2 x half> @llvm.copysign.v2f16(<2 x half>, <2 x half>)
+declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>)
+declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>)
+
+; GCN-LABEL: {{^}}test_copysign_f16:
+; SI: buffer_load_ushort v[[SIGN:[0-9]+]]
+; SI: buffer_load_ushort v[[MAG:[0-9]+]]
+; SI: s_brev_b32 s[[CONST:[0-9]+]], -2
+; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
+; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_F32]]
+; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]]
+; GFX89: buffer_load_ushort v[[SIGN:[0-9]+]]
+; GFX89: buffer_load_ushort v[[MAG:[0-9]+]]
+; GFX89: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
+; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN]]
+; GCN: buffer_store_short v[[OUT]]
+; GCN: s_endpgm
+define amdgpu_kernel void @test_copysign_f16(
+ half addrspace(1)* %arg_out,
+ half addrspace(1)* %arg_mag,
+ half addrspace(1)* %arg_sign) {
+entry:
+ %mag = load half, half addrspace(1)* %arg_mag
+ %sign = load half, half addrspace(1)* %arg_sign
+ %out = call half @llvm.copysign.f16(half %mag, half %sign)
+ store half %out, half addrspace(1)* %arg_out
+ ret void
+}
+
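+; Informal note on the lowering checked above: v_bfi_b32 dst, mask, mag,
+; sign computes (mag & mask) | (sign & ~mask), so mask = 0x7fff keeps
+; everything but the f16 sign bit from the magnitude. On SI the operands
+; are first promoted to f32, and the f32 mask comes from s_brev_b32 with
+; -2: bit-reversing 0xfffffffe yields 0x7fffffff.
+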
+; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f16_sign_f32:
+; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
+; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]]
+; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
+; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]]
+; GCN: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_EXT]], v[[SIGN]]
+; GCN: buffer_store_dword v[[OUT]]
+; GCN: s_endpgm
+define amdgpu_kernel void @test_copysign_out_f32_mag_f16_sign_f32(
+ float addrspace(1)* %arg_out,
+ half addrspace(1)* %arg_mag,
+ float addrspace(1)* %arg_sign) {
+entry:
+ %mag = load half, half addrspace(1)* %arg_mag
+ %mag.ext = fpext half %mag to float
+ %sign = load float, float addrspace(1)* %arg_sign
+ %out = call float @llvm.copysign.f32(float %mag.ext, float %sign)
+ store float %out, float addrspace(1)* %arg_out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f16_sign_f64:
+; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
+; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
+; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
+; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]]
+; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[MAG_EXT_LO:[0-9]+]]:[[MAG_EXT_HI:[0-9]+]]{{\]}}, v[[MAG_EXT]]
+; GCN: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_EXT_HI]], v[[SIGN_HI]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[MAG_EXT_LO]]:[[OUT_HI]]{{\]}}
+; GCN: s_endpgm
+define amdgpu_kernel void @test_copysign_out_f64_mag_f16_sign_f64(
+ double addrspace(1)* %arg_out,
+ half addrspace(1)* %arg_mag,
+ double addrspace(1)* %arg_sign) {
+entry:
+ %mag = load half, half addrspace(1)* %arg_mag
+ %mag.ext = fpext half %mag to double
+ %sign = load double, double addrspace(1)* %arg_sign
+ %out = call double @llvm.copysign.f64(double %mag.ext, double %sign)
+ store double %out, double addrspace(1)* %arg_out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f32_sign_f16:
+; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]]
+; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
+; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
+; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
+; SI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_F32]]
+; GFX89-DAG: v_lshlrev_b32_e32 v[[SIGN_SHIFT:[0-9]+]], 16, v[[SIGN]]
+; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]]
+; GCN: buffer_store_dword v[[OUT]]
+; GCN: s_endpgm
+define amdgpu_kernel void @test_copysign_out_f32_mag_f32_sign_f16(
+ float addrspace(1)* %arg_out,
+ float addrspace(1)* %arg_mag,
+ half addrspace(1)* %arg_sign) {
+entry:
+ %mag = load float, float addrspace(1)* %arg_mag
+ %sign = load half, half addrspace(1)* %arg_sign
+ %sign.ext = fpext half %sign to float
+ %out = call float @llvm.copysign.f32(float %mag, float %sign.ext)
+ store float %out, float addrspace(1)* %arg_out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f64_sign_f16:
+; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}}
+; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
+; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
+; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
+; SI: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_F32]]
+; GFX89-DAG: v_lshlrev_b32_e32 v[[SIGN_SHIFT:[0-9]+]], 16, v[[SIGN]]
+; GFX89: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_SHIFT]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[MAG_LO]]:[[OUT_HI]]{{\]}}
+; GCN: s_endpgm
+define amdgpu_kernel void @test_copysign_out_f64_mag_f64_sign_f16(
+ double addrspace(1)* %arg_out,
+ double addrspace(1)* %arg_mag,
+ half addrspace(1)* %arg_sign) {
+entry:
+ %mag = load double, double addrspace(1)* %arg_mag
+ %sign = load half, half addrspace(1)* %arg_sign
+ %sign.ext = fpext half %sign to double
+ %out = call double @llvm.copysign.f64(double %mag, double %sign.ext)
+ store double %out, double addrspace(1)* %arg_out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f32:
+; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
+; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]]
+; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
+; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
+; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN]]
+; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]]
+; GFX89-DAG: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
+; GFX89-DAG: v_lshrrev_b32_e32 v[[SIGN_SHIFT:[0-9]+]], 16, v[[SIGN]]
+; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]]
+; GCN: buffer_store_short v[[OUT]]
+; GCN: s_endpgm
+define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f32(
+ half addrspace(1)* %arg_out,
+ half addrspace(1)* %arg_mag,
+ float addrspace(1)* %arg_sign) {
+entry:
+ %mag = load half, half addrspace(1)* %arg_mag
+ %sign = load float, float addrspace(1)* %arg_sign
+ %sign.trunc = fptrunc float %sign to half
+ %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
+ store half %out, half addrspace(1)* %arg_out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f64:
+; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
+; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
+; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
+; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
+; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_HI]]
+; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]]
+; GFX89-DAG: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
+; GFX89-DAG: v_lshrrev_b32_e32 v[[SIGN_SHIFT:[0-9]+]], 16, v[[SIGN_HI]]
+; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]]
+; GCN: buffer_store_short v[[OUT]]
+; GCN: s_endpgm
+define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f64(
+ half addrspace(1)* %arg_out,
+ half addrspace(1)* %arg_mag,
+ double addrspace(1)* %arg_sign) {
+entry:
+ %mag = load half, half addrspace(1)* %arg_mag
+ %sign = load double, double addrspace(1)* %arg_sign
+ %sign.trunc = fptrunc double %sign to half
+ %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
+ store half %out, half addrspace(1)* %arg_out
+ ret void
+}
+
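+; Informal note on the two mixed-width cases above: copysign only needs the
+; sign bit, so GFX89 skips the fptrunc of %sign and shifts the sign-carrying
+; high half into place (v_lshrrev_b32_e32 ... 16) before the v_bfi_b32,
+; while SI still round-trips through f32 conversions.
+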
+; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f32_sign_f16:
+; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]]
+; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
+; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
+; SI-DAG: v_cvt_f16_f32_e32 v[[MAG_TRUNC:[0-9]+]], v[[MAG]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG_TRUNC]]
+; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_F32]]
+; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]]
+; GFX89-DAG: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
+; GFX89-DAG: v_cvt_f16_f32_e32 v[[MAG_TRUNC:[0-9]+]], v[[MAG]]
+; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_TRUNC]], v[[SIGN]]
+; GCN: buffer_store_short v[[OUT]]
+; GCN: s_endpgm
+define amdgpu_kernel void @test_copysign_out_f16_mag_f32_sign_f16(
+ half addrspace(1)* %arg_out,
+ float addrspace(1)* %arg_mag,
+ half addrspace(1)* %arg_sign) {
+entry:
+ %mag = load float, float addrspace(1)* %arg_mag
+ %mag.trunc = fptrunc float %mag to half
+ %sign = load half, half addrspace(1)* %arg_sign
+ %out = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
+ store half %out, half addrspace(1)* %arg_out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f64_sign_f16:
+; GCN: v_bfi_b32
+; GCN: s_endpgm
+define amdgpu_kernel void @test_copysign_out_f16_mag_f64_sign_f16(
+ half addrspace(1)* %arg_out,
+ double addrspace(1)* %arg_mag,
+ half addrspace(1)* %arg_sign) {
+entry:
+ %mag = load double, double addrspace(1)* %arg_mag
+ %mag.trunc = fptrunc double %mag to half
+ %sign = load half, half addrspace(1)* %arg_sign
+ %out = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
+ store half %out, half addrspace(1)* %arg_out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_copysign_v2f16:
+; GCN: v_bfi_b32
+; GCN: v_bfi_b32
+; GFX8: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GCN: s_endpgm
+define amdgpu_kernel void @test_copysign_v2f16(
+ <2 x half> addrspace(1)* %arg_out,
+ <2 x half> %arg_mag,
+ <2 x half> %arg_sign) {
+entry:
+ %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %arg_mag, <2 x half> %arg_sign)
+ store <2 x half> %out, <2 x half> addrspace(1)* %arg_out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_copysign_v3f16:
+; GCN: v_bfi_b32
+; GCN: v_bfi_b32
+; GCN: v_bfi_b32
+; GCN: s_endpgm
+define amdgpu_kernel void @test_copysign_v3f16(
+ <3 x half> addrspace(1)* %arg_out,
+ <3 x half> %arg_mag,
+ <3 x half> %arg_sign) {
+entry:
+ %out = call <3 x half> @llvm.copysign.v3f16(<3 x half> %arg_mag, <3 x half> %arg_sign)
+ store <3 x half> %out, <3 x half> addrspace(1)* %arg_out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_copysign_v4f16:
+; GCN: v_bfi_b32
+; GCN: v_bfi_b32
+; GCN: v_bfi_b32
+; GCN: v_bfi_b32
+; GCN: s_endpgm
+define amdgpu_kernel void @test_copysign_v4f16(
+ <4 x half> addrspace(1)* %arg_out,
+ <4 x half> %arg_mag,
+ <4 x half> %arg_sign) {
+entry:
+ %out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %arg_mag, <4 x half> %arg_sign)
+ store <4 x half> %out, <4 x half> addrspace(1)* %arg_out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fcopysign.f32.ll b/test/CodeGen/AMDGPU/fcopysign.f32.ll
index 632de18dafcb..e5893e5995a3 100644
--- a/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -20,7 +20,7 @@ declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind read
; GCN: s_endpgm
; EG: BFI_INT
-define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind {
+define amdgpu_kernel void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind {
%result = call float @llvm.copysign.f32(float %mag, float %sign)
store float %result, float addrspace(1)* %out, align 4
ret void
@@ -31,7 +31,7 @@ define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign
; EG: BFI_INT
; EG: BFI_INT
-define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind {
+define amdgpu_kernel void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind {
%result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign)
store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8
ret void
@@ -44,7 +44,7 @@ define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %ma
; EG: BFI_INT
; EG: BFI_INT
; EG: BFI_INT
-define void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind {
+define amdgpu_kernel void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind {
%result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign)
store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
ret void
diff --git a/test/CodeGen/AMDGPU/fcopysign.f64.ll b/test/CodeGen/AMDGPU/fcopysign.f64.ll
index 12c942beee6c..67779a8ff3b9 100644
--- a/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -17,7 +17,7 @@ declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind r
; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
; GCN: s_endpgm
-define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {
+define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {
%result = call double @llvm.copysign.f64(double %mag, double %sign)
store double %result, double addrspace(1)* %out, align 8
ret void
@@ -32,7 +32,7 @@ define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %s
; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN]]
; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
-define void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float %sign) nounwind {
+define amdgpu_kernel void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float %sign) nounwind {
%c = fpext float %sign to double
%result = call double @llvm.copysign.f64(double %mag, double %c)
store double %result, double addrspace(1)* %out, align 8
@@ -41,7 +41,7 @@ define void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float
; FUNC-LABEL: {{^}}test_copysign_v2f64:
; GCN: s_endpgm
-define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind {
+define amdgpu_kernel void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind {
%result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign)
store <2 x double> %result, <2 x double> addrspace(1)* %out, align 8
ret void
@@ -49,7 +49,7 @@ define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %
; FUNC-LABEL: {{^}}test_copysign_v4f64:
; GCN: s_endpgm
-define void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind {
+define amdgpu_kernel void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind {
%result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign)
store <4 x double> %result, <4 x double> addrspace(1)* %out, align 8
ret void
diff --git a/test/CodeGen/AMDGPU/fdiv.f16.ll b/test/CodeGen/AMDGPU/fdiv.f16.ll
index 70b70bdaaaa7..7f84e973c958 100644
--- a/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -31,7 +31,7 @@
; VI: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]]
; VI: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_fdiv_f16(
+define amdgpu_kernel void @v_fdiv_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) #0 {
@@ -54,7 +54,7 @@ entry:
; VI: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
; VI-NOT: [[RESULT]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -72,7 +72,7 @@ entry:
; VI: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]|
; VI-NOT: [[RESULT]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -91,7 +91,7 @@ entry:
; VI: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
; VI-NOT: [[RESULT]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -109,7 +109,7 @@ entry:
; VI: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]]
; VI-NOT: [[RESULT]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -127,7 +127,7 @@ entry:
; VI: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
; VI-NOT: [[RESULT]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -147,7 +147,7 @@ entry:
; VI-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]]
; VI-NOT: [[RESULT]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -168,7 +168,7 @@ entry:
; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -190,7 +190,7 @@ entry:
; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 {
+define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -204,6 +204,42 @@ entry:
ret void
}
+; FUNC-LABEL: {{^}}div_arcp_2_x_pat_f16:
+; SI: v_mul_f32_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
+
+; VI: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
+; VI: buffer_store_short [[MUL]]
+define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {
+ %x = load half, half addrspace(1)* undef
+ %rcp = fdiv arcp half %x, 2.0
+ store half %rcp, half addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f16:
+; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dcccccd, v{{[0-9]+}}
+
+; VI: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
+; VI: buffer_store_short [[MUL]]
+define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {
+ %x = load half, half addrspace(1)* undef
+ %rcp = fdiv arcp half %x, 10.0
+ store half %rcp, half addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f16:
+; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdcccccd, v{{[0-9]+}}
+
+; VI: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
+; VI: buffer_store_short [[MUL]]
+define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
+ %x = load half, half addrspace(1)* undef
+ %rcp = fdiv arcp half %x, -10.0
+ store half %rcp, half addrspace(1)* %out, align 4
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.sqrt.f16(half) #1
declare half @llvm.fabs.f16(half) #1
diff --git a/test/CodeGen/AMDGPU/fdiv.f64.ll b/test/CodeGen/AMDGPU/fdiv.f64.ll
index 20f9e4df07fd..d16bdf43ee26 100644
--- a/test/CodeGen/AMDGPU/fdiv.f64.ll
+++ b/test/CodeGen/AMDGPU/fdiv.f64.ll
@@ -1,11 +1,11 @@
-; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
-; COMMON-LABEL: {{^}}fdiv_f64:
-; COMMON-DAG: buffer_load_dwordx2 [[NUM:v\[[0-9]+:[0-9]+\]]], off, {{s\[[0-9]+:[0-9]+\]}}, 0
-; COMMON-DAG: buffer_load_dwordx2 [[DEN:v\[[0-9]+:[0-9]+\]]], off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN-LABEL: {{^}}fdiv_f64:
+; GCN-DAG: buffer_load_dwordx2 [[NUM:v\[[0-9]+:[0-9]+\]]], off, {{s\[[0-9]+:[0-9]+\]}}, 0
+; GCN-DAG: buffer_load_dwordx2 [[DEN:v\[[0-9]+:[0-9]+\]]], off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]]
; CI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], vcc, [[NUM]], [[DEN]], [[NUM]]
@@ -13,23 +13,23 @@
; SI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]]
; SI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[NUM]], [[DEN]], [[NUM]]
-; COMMON-DAG: v_rcp_f64_e32 [[RCP_SCALE0:v\[[0-9]+:[0-9]+\]]], [[SCALE0]]
+; GCN-DAG: v_rcp_f64_e32 [[RCP_SCALE0:v\[[0-9]+:[0-9]+\]]], [[SCALE0]]
; SI-DAG: v_cmp_eq_u32_e32 vcc, {{v[0-9]+}}, {{v[0-9]+}}
; SI-DAG: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}
; SI-DAG: s_xor_b64 vcc, [[CMP0]], vcc
-; COMMON-DAG: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[RCP_SCALE0]], 1.0
-; COMMON-DAG: v_fma_f64 [[FMA1:v\[[0-9]+:[0-9]+\]]], [[RCP_SCALE0]], [[FMA0]], [[RCP_SCALE0]]
-; COMMON-DAG: v_fma_f64 [[FMA2:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[FMA1]], 1.0
-; COMMON-DAG: v_fma_f64 [[FMA3:v\[[0-9]+:[0-9]+\]]], [[FMA1]], [[FMA2]], [[FMA1]]
-; COMMON-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[SCALE1]], [[FMA3]]
-; COMMON-DAG: v_fma_f64 [[FMA4:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[MUL]], [[SCALE1]]
-; COMMON: v_div_fmas_f64 [[FMAS:v\[[0-9]+:[0-9]+\]]], [[FMA4]], [[FMA3]], [[MUL]]
-; COMMON: v_div_fixup_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[FMAS]], [[DEN]], [[NUM]]
-; COMMON: buffer_store_dwordx2 [[RESULT]]
-; COMMON: s_endpgm
-define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) nounwind {
+; GCN-DAG: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[RCP_SCALE0]], 1.0
+; GCN-DAG: v_fma_f64 [[FMA1:v\[[0-9]+:[0-9]+\]]], [[RCP_SCALE0]], [[FMA0]], [[RCP_SCALE0]]
+; GCN-DAG: v_fma_f64 [[FMA2:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[FMA1]], 1.0
+; GCN-DAG: v_fma_f64 [[FMA3:v\[[0-9]+:[0-9]+\]]], [[FMA1]], [[FMA2]], [[FMA1]]
+; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[SCALE1]], [[FMA3]]
+; GCN-DAG: v_fma_f64 [[FMA4:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[MUL]], [[SCALE1]]
+; GCN: v_div_fmas_f64 [[FMAS:v\[[0-9]+:[0-9]+\]]], [[FMA4]], [[FMA3]], [[MUL]]
+; GCN: v_div_fixup_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[FMAS]], [[DEN]], [[NUM]]
+; GCN: buffer_store_dwordx2 [[RESULT]]
+; GCN: s_endpgm
+define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
%gep.1 = getelementptr double, double addrspace(1)* %in, i32 1
%num = load volatile double, double addrspace(1)* %in
%den = load volatile double, double addrspace(1)* %gep.1
@@ -38,31 +38,31 @@ define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) nounw
ret void
}
-; COMMON-LABEL: {{^}}fdiv_f64_s_v:
-define void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) nounwind {
+; GCN-LABEL: {{^}}fdiv_f64_s_v:
+define amdgpu_kernel void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) #0 {
%den = load double, double addrspace(1)* %in
%result = fdiv double %num, %den
store double %result, double addrspace(1)* %out
ret void
}
-; COMMON-LABEL: {{^}}fdiv_f64_v_s:
-define void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) nounwind {
+; GCN-LABEL: {{^}}fdiv_f64_v_s:
+define amdgpu_kernel void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) #0 {
%num = load double, double addrspace(1)* %in
%result = fdiv double %num, %den
store double %result, double addrspace(1)* %out
ret void
}
-; COMMON-LABEL: {{^}}fdiv_f64_s_s:
-define void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) nounwind {
+; GCN-LABEL: {{^}}fdiv_f64_s_s:
+define amdgpu_kernel void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) #0 {
%result = fdiv double %num, %den
store double %result, double addrspace(1)* %out
ret void
}
-; COMMON-LABEL: {{^}}v_fdiv_v2f64:
-define void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) nounwind {
+; GCN-LABEL: {{^}}v_fdiv_v2f64:
+define amdgpu_kernel void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
%gep.1 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in, i32 1
%num = load <2 x double>, <2 x double> addrspace(1)* %in
%den = load <2 x double>, <2 x double> addrspace(1)* %gep.1
@@ -71,15 +71,15 @@ define void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspac
ret void
}
-; COMMON-LABEL: {{^}}s_fdiv_v2f64:
-define void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) {
+; GCN-LABEL: {{^}}s_fdiv_v2f64:
+define amdgpu_kernel void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) {
%result = fdiv <2 x double> %num, %den
store <2 x double> %result, <2 x double> addrspace(1)* %out
ret void
}
-; COMMON-LABEL: {{^}}v_fdiv_v4f64:
-define void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) nounwind {
+; GCN-LABEL: {{^}}v_fdiv_v4f64:
+define amdgpu_kernel void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
%gep.1 = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1
%num = load <4 x double>, <4 x double> addrspace(1)* %in
%den = load <4 x double>, <4 x double> addrspace(1)* %gep.1
@@ -88,9 +88,46 @@ define void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspac
ret void
}
-; COMMON-LABEL: {{^}}s_fdiv_v4f64:
-define void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) {
+; GCN-LABEL: {{^}}s_fdiv_v4f64:
+define amdgpu_kernel void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) #0 {
%result = fdiv <4 x double> %num, %den
store <4 x double> %result, <4 x double> addrspace(1)* %out
ret void
}
+
+; GCN-LABEL: {{^}}div_fast_2_x_pat_f64:
+; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, 0.5
+; GCN: buffer_store_dwordx2 [[MUL]]
+define amdgpu_kernel void @div_fast_2_x_pat_f64(double addrspace(1)* %out) #1 {
+ %x = load double, double addrspace(1)* undef
+ %rcp = fdiv fast double %x, 2.0
+ store double %rcp, double addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}div_fast_k_x_pat_f64:
+; GCN-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x9999999a
+; GCN-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fb99999
+; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+; GCN: buffer_store_dwordx2 [[MUL]]
+define amdgpu_kernel void @div_fast_k_x_pat_f64(double addrspace(1)* %out) #1 {
+ %x = load double, double addrspace(1)* undef
+ %rcp = fdiv fast double %x, 10.0
+ store double %rcp, double addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}div_fast_neg_k_x_pat_f64:
+; GCN-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x9999999a
+; GCN-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfb99999
+; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+; GCN: buffer_store_dwordx2 [[MUL]]
+define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(double addrspace(1)* %out) #1 {
+ %x = load double, double addrspace(1)* undef
+ %rcp = fdiv fast double %x, -10.0
+ store double %rcp, double addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll
index 0e95de9c555c..b3a2b6643720 100644
--- a/test/CodeGen/AMDGPU/fdiv.ll
+++ b/test/CodeGen/AMDGPU/fdiv.ll
@@ -27,7 +27,7 @@
; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
-define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
%fdiv = fdiv float %a, %b
store float %fdiv, float addrspace(1)* %out
@@ -52,7 +52,7 @@ entry:
; GCN-NOT: s_setreg
; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
-define void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
+define amdgpu_kernel void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
%fdiv = fdiv float %a, %b
store float %fdiv, float addrspace(1)* %out
@@ -65,7 +65,7 @@ entry:
; GCN: v_rcp_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
-define void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
%fdiv = fdiv float %a, %b, !fpmath !0
store float %fdiv, float addrspace(1)* %out
@@ -77,7 +77,7 @@ entry:
; GCN: v_fma_f32
; GCN: v_div_fmas_f32
; GCN: v_div_fixup_f32
-define void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
+define amdgpu_kernel void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
%fdiv = fdiv float %a, %b, !fpmath !0
store float %fdiv, float addrspace(1)* %out
@@ -89,7 +89,7 @@ entry:
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
-define void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
+define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
%fdiv = fdiv fast float %a, %b
store float %fdiv, float addrspace(1)* %out
@@ -104,7 +104,7 @@ entry:
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
-define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
%fdiv = fdiv fast float %a, %b
store float %fdiv, float addrspace(1)* %out
@@ -119,7 +119,7 @@ entry:
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
-define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
%fdiv = fdiv arcp float %a, %b
store float %fdiv, float addrspace(1)* %out
@@ -136,7 +136,7 @@ entry:
; GCN: v_div_scale_f32
; GCN: v_div_scale_f32
; GCN: v_div_scale_f32
-define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
%fdiv = fdiv <2 x float> %a, %b
store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
@@ -146,7 +146,7 @@ entry:
; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
; GCN: v_cmp_gt_f32
; GCN: v_cmp_gt_f32
-define void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
%fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
@@ -161,7 +161,7 @@ entry:
; GCN: v_rcp_f32
; GCN: v_rcp_f32
-define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
%fdiv = fdiv fast <2 x float> %a, %b
store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
@@ -176,7 +176,7 @@ entry:
; GCN: v_rcp_f32
; GCN: v_rcp_f32
-define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
%fdiv = fdiv arcp <2 x float> %a, %b
store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
@@ -197,7 +197,7 @@ entry:
; GCN: v_div_fixup_f32
; GCN: v_div_fixup_f32
; GCN: v_div_fixup_f32
-define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
%b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float>, <4 x float> addrspace(1) * %in
%b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
@@ -220,7 +220,7 @@ define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)
; GCN: v_rcp_f32
; GCN: v_rcp_f32
; GCN: v_rcp_f32
-define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
%b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float>, <4 x float> addrspace(1) * %in
%b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
@@ -243,7 +243,7 @@ define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> ad
; GCN: v_rcp_f32
; GCN: v_rcp_f32
; GCN: v_rcp_f32
-define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
%b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float>, <4 x float> addrspace(1) * %in
%b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/AMDGPU/ffloor.f64.ll b/test/CodeGen/AMDGPU/ffloor.f64.ll
index 83ffbdfa23a5..407cccb8443e 100644
--- a/test/CodeGen/AMDGPU/ffloor.f64.ll
+++ b/test/CodeGen/AMDGPU/ffloor.f64.ll
@@ -19,7 +19,7 @@ declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone
; SI: v_cndmask_b32_e32
; SI: v_add_f64
; SI: s_endpgm
-define void @ffloor_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @ffloor_f64(double addrspace(1)* %out, double %x) {
%y = call double @llvm.floor.f64(double %x) nounwind readnone
store double %y, double addrspace(1)* %out
ret void
@@ -34,7 +34,7 @@ define void @ffloor_f64(double addrspace(1)* %out, double %x) {
; SI: v_cndmask_b32_e32
; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT]]
; SI: s_endpgm
-define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @ffloor_f64_neg(double addrspace(1)* %out, double %x) {
%neg = fsub double 0.0, %x
%y = call double @llvm.floor.f64(double %neg) nounwind readnone
store double %y, double addrspace(1)* %out
@@ -50,7 +50,7 @@ define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) {
; SI: v_cndmask_b32_e32
; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT]]|
; SI: s_endpgm
-define void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) {
%abs = call double @llvm.fabs.f64(double %x)
%neg = fsub double 0.0, %abs
%y = call double @llvm.floor.f64(double %neg) nounwind readnone
@@ -61,7 +61,7 @@ define void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) {
; FUNC-LABEL: {{^}}ffloor_v2f64:
; CI: v_floor_f64_e32
; CI: v_floor_f64_e32
-define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+define amdgpu_kernel void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
%y = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) nounwind readnone
store <2 x double> %y, <2 x double> addrspace(1)* %out
ret void
@@ -72,7 +72,7 @@ define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
; CI: v_floor_f64_e32
; CI: v_floor_f64_e32
; CI-NOT: v_floor_f64_e32
-define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+define amdgpu_kernel void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
%y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone
store <3 x double> %y, <3 x double> addrspace(1)* %out
ret void
@@ -83,7 +83,7 @@ define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
; CI: v_floor_f64_e32
; CI: v_floor_f64_e32
; CI: v_floor_f64_e32
-define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+define amdgpu_kernel void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
%y = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone
store <4 x double> %y, <4 x double> addrspace(1)* %out
ret void
@@ -98,7 +98,7 @@ define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
; CI: v_floor_f64_e32
; CI: v_floor_f64_e32
; CI: v_floor_f64_e32
-define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+define amdgpu_kernel void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
%y = call <8 x double> @llvm.floor.v8f64(<8 x double> %x) nounwind readnone
store <8 x double> %y, <8 x double> addrspace(1)* %out
ret void
@@ -121,7 +121,7 @@ define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
; CI: v_floor_f64_e32
; CI: v_floor_f64_e32
; CI: v_floor_f64_e32
-define void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+define amdgpu_kernel void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
%y = call <16 x double> @llvm.floor.v16f64(<16 x double> %x) nounwind readnone
store <16 x double> %y, <16 x double> addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/ffloor.ll b/test/CodeGen/AMDGPU/ffloor.ll
index d7f35a45075c..720fe7a45e3d 100644
--- a/test/CodeGen/AMDGPU/ffloor.ll
+++ b/test/CodeGen/AMDGPU/ffloor.ll
@@ -5,7 +5,7 @@
; FUNC-LABEL: {{^}}floor_f32:
; SI: v_floor_f32_e32
; R600: FLOOR
-define void @floor_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @floor_f32(float addrspace(1)* %out, float %in) {
%tmp = call float @llvm.floor.f32(float %in) #0
store float %tmp, float addrspace(1)* %out
ret void
@@ -15,7 +15,7 @@ define void @floor_f32(float addrspace(1)* %out, float %in) {
; SI: v_floor_f32_e32
; SI: v_floor_f32_e32
-define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
%tmp = call <2 x float> @llvm.floor.v2f32(<2 x float> %in) #0
store <2 x float> %tmp, <2 x float> addrspace(1)* %out
ret void
@@ -31,7 +31,7 @@ define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
; R600: FLOOR
; R600: FLOOR
; R600: FLOOR
-define void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
%tmp = call <4 x float> @llvm.floor.v4f32(<4 x float> %in) #0
store <4 x float> %tmp, <4 x float> addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/fix-vgpr-copies.mir b/test/CodeGen/AMDGPU/fix-vgpr-copies.mir
new file mode 100644
index 000000000000..4951e0df4d3e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fix-vgpr-copies.mir
@@ -0,0 +1,44 @@
+# RUN: llc -march=amdgcn -start-after=greedy -stop-after=si-optimize-exec-masking -o - %s | FileCheck %s
+# Check that we first execute all vector instructions and only then change exec
+# CHECK-DAG: COPY %vgpr10_vgpr11
+# CHECK-DAG: COPY %vgpr12_vgpr13
+# CHECK: %exec = COPY
+
+---
+name: main
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%sgpr4_sgpr5' }
+ - { reg: '%sgpr6' }
+ - { reg: '%vgpr0' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 4
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ liveins: %vgpr3, %vgpr10_vgpr11, %vgpr12_vgpr13
+
+ %vcc = V_CMP_NE_U32_e64 0, killed %vgpr3, implicit %exec
+ %sgpr4_sgpr5 = COPY %exec, implicit-def %exec
+ %sgpr6_sgpr7 = S_AND_B64 %sgpr4_sgpr5, killed %vcc, implicit-def dead %scc
+ %sgpr4_sgpr5 = S_XOR_B64 %sgpr6_sgpr7, killed %sgpr4_sgpr5, implicit-def dead %scc
+ %vgpr61_vgpr62 = COPY %vgpr10_vgpr11
+ %vgpr155_vgpr156 = COPY %vgpr12_vgpr13
+ %exec = S_MOV_B64_term killed %sgpr6_sgpr7
+...
diff --git a/test/CodeGen/AMDGPU/flat-address-space.ll b/test/CodeGen/AMDGPU/flat-address-space.ll
index 55b5482d031f..c867e4fca229 100644
--- a/test/CodeGen/AMDGPU/flat-address-space.ll
+++ b/test/CodeGen/AMDGPU/flat-address-space.ll
@@ -17,43 +17,43 @@
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
; CHECK: flat_store_dword v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}, v[[DATA]]
-define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
+define amdgpu_kernel void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
%fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
- store i32 %x, i32 addrspace(4)* %fptr, align 4
+ store volatile i32 %x, i32 addrspace(4)* %fptr, align 4
ret void
}
; CHECK-LABEL: {{^}}store_flat_i64:
; CHECK: flat_store_dwordx2
-define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
+define amdgpu_kernel void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
%fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
- store i64 %x, i64 addrspace(4)* %fptr, align 8
+ store volatile i64 %x, i64 addrspace(4)* %fptr, align 8
ret void
}
; CHECK-LABEL: {{^}}store_flat_v4i32:
; CHECK: flat_store_dwordx4
-define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
+define amdgpu_kernel void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
%fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
- store <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16
+ store volatile <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16
ret void
}
; CHECK-LABEL: {{^}}store_flat_trunc_i16:
; CHECK: flat_store_short
-define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
+define amdgpu_kernel void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
%fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
%y = trunc i32 %x to i16
- store i16 %y, i16 addrspace(4)* %fptr, align 2
+ store volatile i16 %y, i16 addrspace(4)* %fptr, align 2
ret void
}
; CHECK-LABEL: {{^}}store_flat_trunc_i8:
; CHECK: flat_store_byte
-define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
+define amdgpu_kernel void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
%fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
%y = trunc i32 %x to i8
- store i8 %y, i8 addrspace(4)* %fptr, align 2
+ store volatile i8 %y, i8 addrspace(4)* %fptr, align 2
ret void
}
@@ -61,36 +61,36 @@ define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
; CHECK-LABEL: load_flat_i32:
; CHECK: flat_load_dword
-define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
%fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
- %fload = load i32, i32 addrspace(4)* %fptr, align 4
+ %fload = load volatile i32, i32 addrspace(4)* %fptr, align 4
store i32 %fload, i32 addrspace(1)* %out, align 4
ret void
}
; CHECK-LABEL: load_flat_i64:
; CHECK: flat_load_dwordx2
-define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
%fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
- %fload = load i64, i64 addrspace(4)* %fptr, align 8
+ %fload = load volatile i64, i64 addrspace(4)* %fptr, align 8
store i64 %fload, i64 addrspace(1)* %out, align 8
ret void
}
; CHECK-LABEL: load_flat_v4i32:
; CHECK: flat_load_dwordx4
-define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
%fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
- %fload = load <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 32
+ %fload = load volatile <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 32
store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
ret void
}
; CHECK-LABEL: sextload_flat_i8:
; CHECK: flat_load_sbyte
-define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
%fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
- %fload = load i8, i8 addrspace(4)* %fptr, align 4
+ %fload = load volatile i8, i8 addrspace(4)* %fptr, align 4
%ext = sext i8 %fload to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
@@ -98,9 +98,9 @@ define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* n
; CHECK-LABEL: zextload_flat_i8:
; CHECK: flat_load_ubyte
-define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
%fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
- %fload = load i8, i8 addrspace(4)* %fptr, align 4
+ %fload = load volatile i8, i8 addrspace(4)* %fptr, align 4
%ext = zext i8 %fload to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
@@ -108,9 +108,9 @@ define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* n
; CHECK-LABEL: sextload_flat_i16:
; CHECK: flat_load_sshort
-define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
%fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
- %fload = load i16, i16 addrspace(4)* %fptr, align 4
+ %fload = load volatile i16, i16 addrspace(4)* %fptr, align 4
%ext = sext i16 %fload to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
@@ -118,9 +118,9 @@ define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)*
; CHECK-LABEL: zextload_flat_i16:
; CHECK: flat_load_ushort
-define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
%fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
- %fload = load i16, i16 addrspace(4)* %fptr, align 4
+ %fload = load volatile i16, i16 addrspace(4)* %fptr, align 4
%ext = zext i16 %fload to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
@@ -131,7 +131,7 @@ define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)*
; CHECK: flat_load_ubyte
; CHECK: flat_load_ubyte
; CHECK: flat_load_ubyte
-define void @flat_scratch_unaligned_load() {
+define amdgpu_kernel void @flat_scratch_unaligned_load() {
%scratch = alloca i32
%fptr = addrspacecast i32* %scratch to i32 addrspace(4)*
%ld = load volatile i32, i32 addrspace(4)* %fptr, align 1
@@ -143,7 +143,7 @@ define void @flat_scratch_unaligned_load() {
; CHECK: flat_store_byte
; CHECK: flat_store_byte
; CHECK: flat_store_byte
-define void @flat_scratch_unaligned_store() {
+define amdgpu_kernel void @flat_scratch_unaligned_store() {
%scratch = alloca i32
%fptr = addrspacecast i32* %scratch to i32 addrspace(4)*
store volatile i32 0, i32 addrspace(4)* %fptr, align 1
@@ -154,7 +154,7 @@ define void @flat_scratch_unaligned_store() {
; HSA: flat_load_dword
; HSA: flat_load_dword
; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr
-define void @flat_scratch_multidword_load() {
+define amdgpu_kernel void @flat_scratch_multidword_load() {
%scratch = alloca <2 x i32>
%fptr = addrspacecast <2 x i32>* %scratch to <2 x i32> addrspace(4)*
%ld = load volatile <2 x i32>, <2 x i32> addrspace(4)* %fptr
@@ -165,7 +165,7 @@ define void @flat_scratch_multidword_load() {
; HSA: flat_store_dword
; HSA: flat_store_dword
; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr
-define void @flat_scratch_multidword_store() {
+define amdgpu_kernel void @flat_scratch_multidword_store() {
%scratch = alloca <2 x i32>
%fptr = addrspacecast <2 x i32>* %scratch to <2 x i32> addrspace(4)*
store volatile <2 x i32> zeroinitializer, <2 x i32> addrspace(4)* %fptr
diff --git a/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
index df9ba00c6974..dac1500cd46c 100644
--- a/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
+++ b/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
@@ -23,7 +23,7 @@
; NOHSA-DEFAULT: buffer_store_dword
; NOHSA-NODEFAULT: flat_store_dword
; NOHSA-NOADDR64: flat_store_dword
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
entry:
store i32 0, i32 addrspace(1)* %out
ret void
@@ -36,7 +36,7 @@ entry:
; NOHSA-DEFAULT: buffer_store_dword
; NOHSA-NODEFAULT: flat_store_dword
; NOHSA-NOADDR64: flat_store_dword
-define void @test_addr64(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_addr64(i32 addrspace(1)* %out) {
entry:
%out.addr = alloca i32 addrspace(1)*, align 4
diff --git a/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index b71c8bcb76c7..23f40daf3d23 100644
--- a/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -19,7 +19,7 @@
; CI: ; NumSgprs: 8
; VI-NOXNACK: ; NumSgprs: 8
; VI-XNACK: ; NumSgprs: 12
-define void @no_vcc_no_flat() {
+define amdgpu_kernel void @no_vcc_no_flat() {
entry:
call void asm sideeffect "", "~{SGPR7}"()
ret void
@@ -33,7 +33,7 @@ entry:
; CI: ; NumSgprs: 10
; VI-NOXNACK: ; NumSgprs: 10
; VI-XNACK: ; NumSgprs: 12
-define void @vcc_no_flat() {
+define amdgpu_kernel void @vcc_no_flat() {
entry:
call void asm sideeffect "", "~{SGPR7},~{VCC}"()
ret void
@@ -50,7 +50,7 @@ entry:
; HSA-CI: ; NumSgprs: 8
; HSA-VI-NOXNACK: ; NumSgprs: 8
; HSA-VI-XNACK: ; NumSgprs: 12
-define void @no_vcc_flat() {
+define amdgpu_kernel void @no_vcc_flat() {
entry:
call void asm sideeffect "", "~{SGPR7},~{FLAT_SCR}"()
ret void
@@ -66,7 +66,7 @@ entry:
; HSA-CI: ; NumSgprs: 10
; HSA-VI-NOXNACK: ; NumSgprs: 10
; HSA-VI-XNACK: ; NumSgprs: 12
-define void @vcc_flat() {
+define amdgpu_kernel void @vcc_flat() {
entry:
call void asm sideeffect "", "~{SGPR7},~{VCC},~{FLAT_SCR}"()
ret void
diff --git a/test/CodeGen/AMDGPU/flat_atomics.ll b/test/CodeGen/AMDGPU/flat_atomics.ll
index 7400dbcf8909..cc95d80570e0 100644
--- a/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -3,7 +3,7 @@
; GCN-LABEL: {{^}}atomic_add_i32_offset:
; GCN: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_add_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(4)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -13,7 +13,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i32_ret_offset:
; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -23,7 +23,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i32_addr64_offset:
; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_add_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -34,7 +34,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i32_ret_addr64_offset:
; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -45,7 +45,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i32:
; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_add_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32(i32 addrspace(4)* %out, i32 %in) {
entry:
%val = atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst
ret void
@@ -54,7 +54,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i32_ret:
; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst
store i32 %val, i32 addrspace(4)* %out2
@@ -63,7 +63,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i32_addr64:
; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_add_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -73,7 +73,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i32_ret_addr64:
; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -83,7 +83,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i32_offset:
; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(4)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -93,7 +93,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i32_ret_offset:
; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -103,7 +103,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i32_addr64_offset:
; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -114,7 +114,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i32_ret_addr64_offset:
; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -125,7 +125,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i32:
; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32(i32 addrspace(4)* %out, i32 %in) {
entry:
%val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst
ret void
@@ -134,7 +134,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i32_ret:
; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst
store i32 %val, i32 addrspace(4)* %out2
@@ -143,7 +143,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i32_addr64:
; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -153,7 +153,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i32_ret_addr64:
; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -163,7 +163,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i32_offset:
; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(4)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -173,7 +173,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i32_ret_offset:
; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -183,7 +183,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i32_addr64_offset:
; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -194,7 +194,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset:
; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -205,7 +205,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i32:
; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(4)* %out, i32 %in) {
entry:
%val = atomicrmw volatile sub i32 addrspace(4)* %out, i32 %in seq_cst
ret void
@@ -214,7 +214,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i32_ret:
; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile sub i32 addrspace(4)* %out, i32 %in seq_cst
store i32 %val, i32 addrspace(4)* %out2
@@ -223,7 +223,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i32_addr64:
; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -233,7 +233,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i32_ret_addr64:
; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -243,7 +243,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i32_offset:
; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(4)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -253,7 +253,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i32_ret_offset:
; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -263,7 +263,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i32_addr64_offset:
; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -274,7 +274,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i32_ret_addr64_offset:
; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -285,7 +285,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i32:
; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32(i32 addrspace(4)* %out, i32 %in) {
entry:
%val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst
ret void
@@ -294,7 +294,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i32_ret:
; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst
store i32 %val, i32 addrspace(4)* %out2
@@ -303,7 +303,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i32_addr64:
; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -313,7 +313,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i32_ret_addr64:
; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -323,7 +323,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i32_offset:
; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(4)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -333,7 +333,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i32_ret_offset:
; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -343,7 +343,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i32_addr64_offset:
; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -354,7 +354,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset:
; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -365,7 +365,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i32:
; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(4)* %out, i32 %in) {
entry:
%val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst
ret void
@@ -374,7 +374,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i32_ret:
; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst
store i32 %val, i32 addrspace(4)* %out2
@@ -383,7 +383,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i32_addr64:
; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -393,7 +393,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i32_ret_addr64:
; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -403,7 +403,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i32_offset:
; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(4)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -413,7 +413,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i32_ret_offset:
; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -423,7 +423,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i32_addr64_offset:
; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -434,7 +434,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i32_ret_addr64_offset:
; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -445,7 +445,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i32:
; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32(i32 addrspace(4)* %out, i32 %in) {
entry:
%val = atomicrmw volatile min i32 addrspace(4)* %out, i32 %in seq_cst
ret void
@@ -454,7 +454,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i32_ret:
; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile min i32 addrspace(4)* %out, i32 %in seq_cst
store i32 %val, i32 addrspace(4)* %out2
@@ -463,7 +463,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i32_addr64:
; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -473,7 +473,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i32_ret_addr64:
; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -483,7 +483,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i32_offset:
; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(4)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -493,7 +493,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i32_ret_offset:
; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -503,7 +503,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i32_addr64_offset:
; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -514,7 +514,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset:
; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -525,16 +525,16 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i32:
; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(4)* %out, i32 %in) {
entry:
%val = atomicrmw volatile umin i32 addrspace(4)* %out, i32 %in seq_cst
ret void
}
; GCN-LABEL: {{^}}atomic_umin_i32_ret:
-; GCN: flat_atomic_umin v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile umin i32 addrspace(4)* %out, i32 %in seq_cst
store i32 %val, i32 addrspace(4)* %out2
@@ -543,7 +543,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i32_addr64:
; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -553,7 +553,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i32_ret_addr64:
; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]{{$}}
- define void @atomic_umin_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -563,7 +563,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i32_offset:
; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_or_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(4)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -573,7 +573,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i32_ret_offset:
; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -583,7 +583,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i32_addr64_offset:
; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_or_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -594,7 +594,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i32_ret_addr64_offset:
; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -605,7 +605,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i32:
; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_or_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32(i32 addrspace(4)* %out, i32 %in) {
entry:
%val = atomicrmw volatile or i32 addrspace(4)* %out, i32 %in seq_cst
ret void
@@ -614,7 +614,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i32_ret:
; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile or i32 addrspace(4)* %out, i32 %in seq_cst
store i32 %val, i32 addrspace(4)* %out2
@@ -623,7 +623,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i32_addr64:
; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_or_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -633,7 +633,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i32_ret_addr64:
; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -643,7 +643,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i32_offset:
; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xchg_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(4)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -653,7 +653,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i32_ret_offset:
; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -663,7 +663,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i32_addr64_offset:
; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -674,7 +674,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset:
; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -685,7 +685,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i32:
; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xchg_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(4)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst
ret void
@@ -694,7 +694,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i32_ret:
; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst
store i32 %val, i32 addrspace(4)* %out2
@@ -703,7 +703,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i32_addr64:
; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_xchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -713,7 +713,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i32_ret_addr64:
; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -725,7 +725,7 @@ entry:
; GCN-LABEL: {{^}}atomic_cmpxchg_i32_offset:
; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32_offset(i32 addrspace(4)* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(4)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -735,7 +735,7 @@ entry:
; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset:
; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
-define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -746,7 +746,7 @@ entry:
; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset:
; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -757,7 +757,7 @@ entry:
; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64_offset:
; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
-define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -769,7 +769,7 @@ entry:
; GCN-LABEL: {{^}}atomic_cmpxchg_i32:
; GCN: flat_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32(i32 addrspace(4)* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(4)* %out, i32 %in, i32 %old) {
entry:
%val = cmpxchg volatile i32 addrspace(4)* %out, i32 %old, i32 %in seq_cst seq_cst
ret void
@@ -778,7 +778,7 @@ entry:
; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret:
; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
-define void @atomic_cmpxchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) {
entry:
%val = cmpxchg volatile i32 addrspace(4)* %out, i32 %old, i32 %in seq_cst seq_cst
%flag = extractvalue { i32, i1 } %val, 0
@@ -788,7 +788,7 @@ entry:
; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64:
; GCN: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %old, i32 %in seq_cst seq_cst
@@ -798,7 +798,7 @@ entry:
; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64:
; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
-define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %old, i32 %in seq_cst seq_cst
@@ -809,7 +809,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i32_offset:
; GCN: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xor_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(4)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -819,7 +819,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i32_ret_offset:
; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
%val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -829,7 +829,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i32_addr64_offset:
; GCN: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_xor_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -840,7 +840,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset:
; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -851,7 +851,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i32:
; GCN: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xor_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(4)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xor i32 addrspace(4)* %out, i32 %in seq_cst
ret void
@@ -860,7 +860,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i32_ret:
; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile xor i32 addrspace(4)* %out, i32 %in seq_cst
store i32 %val, i32 addrspace(4)* %out2
@@ -869,7 +869,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i32_addr64:
; GCN: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_xor_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -879,7 +879,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i32_ret_addr64:
; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%val = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -890,7 +890,7 @@ entry:
; GCN-LABEL: {{^}}atomic_load_i32_offset:
; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i32_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_load_i32_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %in, i32 4
%val = load atomic i32, i32 addrspace(4)* %gep seq_cst, align 4
@@ -901,7 +901,7 @@ entry:
; GCN-LABEL: {{^}}atomic_load_i32:
; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i32(i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_load_i32(i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
entry:
%val = load atomic i32, i32 addrspace(4)* %in seq_cst, align 4
store i32 %val, i32 addrspace(4)* %out
@@ -911,7 +911,7 @@ entry:
; GCN-LABEL: {{^}}atomic_load_i32_addr64_offset:
; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i32_addr64_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %in, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -923,7 +923,7 @@ entry:
; GCN-LABEL: {{^}}atomic_load_i32_addr64:
; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i32_addr64(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i32_addr64(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %in, i64 %index
%val = load atomic i32, i32 addrspace(4)* %ptr seq_cst, align 4
@@ -933,7 +933,7 @@ entry:
; GCN-LABEL: {{^}}atomic_store_i32_offset:
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_offset(i32 %in, i32 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(4)* %out) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
store atomic i32 %in, i32 addrspace(4)* %gep seq_cst, align 4
@@ -942,7 +942,7 @@ entry:
; GCN-LABEL: {{^}}atomic_store_i32:
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32(i32 %in, i32 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(4)* %out) {
entry:
store atomic i32 %in, i32 addrspace(4)* %out seq_cst, align 4
ret void
@@ -950,7 +950,7 @@ entry:
; GCN-LABEL: {{^}}atomic_store_i32_addr64_offset:
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(4)* %out, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -960,7 +960,7 @@ entry:
; GCN-LABEL: {{^}}atomic_store_i32_addr64:
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_addr64(i32 %in, i32 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(4)* %out, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
store atomic i32 %in, i32 addrspace(4)* %ptr seq_cst, align 4
diff --git a/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index 0bd6c2dd5b86..723dde9ab68f 100644
--- a/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -3,7 +3,7 @@
; GCN-LABEL: {{^}}atomic_add_i64_offset:
; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
-define void @atomic_add_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(4)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -13,7 +13,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i64_ret_offset:
; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -23,7 +23,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i64_addr64_offset:
; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
-define void @atomic_add_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -34,7 +34,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64_offset:
; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -45,7 +45,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i64:
; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_add_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64(i64 addrspace(4)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile add i64 addrspace(4)* %out, i64 %in seq_cst
ret void
@@ -54,7 +54,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i64_ret:
; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile add i64 addrspace(4)* %out, i64 %in seq_cst
store i64 %tmp0, i64 addrspace(4)* %out2
@@ -63,7 +63,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i64_addr64:
; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_add_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%tmp0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -73,7 +73,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64:
; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%tmp0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -83,7 +83,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i64_offset:
; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_offset(i64 addrspace(4)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -93,7 +93,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i64_ret_offset:
; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -103,7 +103,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i64_addr64_offset:
; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -114,7 +114,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64_offset:
; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -125,7 +125,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i64:
; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64(i64 addrspace(4)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile and i64 addrspace(4)* %out, i64 %in seq_cst
ret void
@@ -134,7 +134,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i64_ret:
; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile and i64 addrspace(4)* %out, i64 %in seq_cst
store i64 %tmp0, i64 addrspace(4)* %out2
@@ -143,7 +143,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i64_addr64:
; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%tmp0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -153,7 +153,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64:
; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%tmp0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -163,7 +163,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i64_offset:
; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_offset(i64 addrspace(4)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -173,7 +173,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i64_ret_offset:
; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -183,7 +183,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i64_addr64_offset:
; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -194,7 +194,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64_offset:
; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -205,7 +205,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i64:
; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64(i64 addrspace(4)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile sub i64 addrspace(4)* %out, i64 %in seq_cst
ret void
@@ -214,7 +214,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i64_ret:
; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile sub i64 addrspace(4)* %out, i64 %in seq_cst
store i64 %tmp0, i64 addrspace(4)* %out2
@@ -223,7 +223,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i64_addr64:
; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%tmp0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -233,7 +233,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64:
; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%tmp0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -243,7 +243,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i64_offset:
; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_offset(i64 addrspace(4)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -253,7 +253,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i64_ret_offset:
; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -263,7 +263,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i64_addr64_offset:
; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -274,7 +274,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64_offset:
; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -285,7 +285,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i64:
; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64(i64 addrspace(4)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile max i64 addrspace(4)* %out, i64 %in seq_cst
ret void
@@ -294,7 +294,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i64_ret:
; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile max i64 addrspace(4)* %out, i64 %in seq_cst
store i64 %tmp0, i64 addrspace(4)* %out2
@@ -303,7 +303,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i64_addr64:
; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%tmp0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -313,7 +313,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64:
; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%tmp0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -323,7 +323,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i64_offset:
; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(4)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -333,7 +333,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i64_ret_offset:
; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -343,7 +343,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i64_addr64_offset:
; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -354,7 +354,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64_offset:
; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -365,7 +365,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i64:
; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64(i64 addrspace(4)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile umax i64 addrspace(4)* %out, i64 %in seq_cst
ret void
@@ -374,7 +374,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i64_ret:
; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile umax i64 addrspace(4)* %out, i64 %in seq_cst
store i64 %tmp0, i64 addrspace(4)* %out2
@@ -383,7 +383,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i64_addr64:
; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%tmp0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -393,7 +393,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64:
; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%tmp0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -403,7 +403,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i64_offset:
; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_offset(i64 addrspace(4)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -413,7 +413,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i64_ret_offset:
; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -423,7 +423,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i64_addr64_offset:
; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -434,7 +434,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64_offset:
; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -445,7 +445,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i64:
; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64(i64 addrspace(4)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile min i64 addrspace(4)* %out, i64 %in seq_cst
ret void
@@ -454,7 +454,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i64_ret:
; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile min i64 addrspace(4)* %out, i64 %in seq_cst
store i64 %tmp0, i64 addrspace(4)* %out2
@@ -463,7 +463,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i64_addr64:
; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%tmp0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -473,7 +473,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64:
; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%tmp0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -483,7 +483,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i64_offset:
; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(4)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -493,7 +493,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i64_ret_offset:
; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -503,7 +503,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i64_addr64_offset:
; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -514,7 +514,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64_offset:
; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -525,7 +525,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i64:
; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64(i64 addrspace(4)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile umin i64 addrspace(4)* %out, i64 %in seq_cst
ret void
@@ -534,7 +534,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i64_ret:
; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile umin i64 addrspace(4)* %out, i64 %in seq_cst
store i64 %tmp0, i64 addrspace(4)* %out2
@@ -543,7 +543,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i64_addr64:
; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%tmp0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -553,7 +553,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64:
; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%tmp0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -563,7 +563,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i64_offset:
; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_offset(i64 addrspace(4)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -573,7 +573,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i64_ret_offset:
; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -583,7 +583,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i64_addr64_offset:
; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -594,7 +594,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64_offset:
; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -605,7 +605,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i64:
; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64(i64 addrspace(4)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile or i64 addrspace(4)* %out, i64 %in seq_cst
ret void
@@ -614,7 +614,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i64_ret:
; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile or i64 addrspace(4)* %out, i64 %in seq_cst
store i64 %tmp0, i64 addrspace(4)* %out2
@@ -623,7 +623,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i64_addr64:
; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%tmp0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -633,7 +633,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64:
; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%tmp0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -643,7 +643,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i64_offset:
; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xchg_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_offset(i64 addrspace(4)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -653,7 +653,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset:
; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -663,7 +663,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i64_addr64_offset:
; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -674,7 +674,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64_offset:
; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -685,7 +685,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i64:
; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xchg_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64(i64 addrspace(4)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %out, i64 %in seq_cst
ret void
@@ -694,7 +694,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i64_ret:
; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %out, i64 %in seq_cst
store i64 %tmp0, i64 addrspace(4)* %out2
@@ -703,7 +703,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i64_addr64:
; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -713,7 +713,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64:
; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -723,7 +723,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i64_offset:
; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_offset(i64 addrspace(4)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -733,7 +733,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i64_ret_offset:
; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -743,7 +743,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i64_addr64_offset:
; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -754,7 +754,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64_offset:
; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -765,7 +765,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i64:
; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64(i64 addrspace(4)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile xor i64 addrspace(4)* %out, i64 %in seq_cst
ret void
@@ -774,7 +774,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i64_ret:
; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile xor i64 addrspace(4)* %out, i64 %in seq_cst
store i64 %tmp0, i64 addrspace(4)* %out2
@@ -783,7 +783,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i64_addr64:
; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%tmp0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -793,7 +793,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64:
; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%tmp0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -804,7 +804,7 @@ entry:
; GCN-LABEL: {{^}}atomic_load_i64_offset:
; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_load_i64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %in, i64 4
%val = load atomic i64, i64 addrspace(4)* %gep seq_cst, align 8
@@ -815,7 +815,7 @@ entry:
; GCN-LABEL: {{^}}atomic_load_i64:
; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i64(i64 addrspace(4)* %in, i64 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_load_i64(i64 addrspace(4)* %in, i64 addrspace(4)* %out) {
entry:
%val = load atomic i64, i64 addrspace(4)* %in seq_cst, align 8
store i64 %val, i64 addrspace(4)* %out
@@ -825,7 +825,7 @@ entry:
; GCN-LABEL: {{^}}atomic_load_i64_addr64_offset:
; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i64_addr64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i64_addr64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %in, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -837,7 +837,7 @@ entry:
; GCN-LABEL: {{^}}atomic_load_i64_addr64:
; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i64_addr64(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i64_addr64(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %in, i64 %index
%val = load atomic i64, i64 addrspace(4)* %ptr seq_cst, align 8
@@ -847,7 +847,7 @@ entry:
; GCN-LABEL: {{^}}atomic_store_i64_offset:
; GCN: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
-define void @atomic_store_i64_offset(i64 %in, i64 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64 addrspace(4)* %out) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
store atomic i64 %in, i64 addrspace(4)* %gep seq_cst, align 8
@@ -856,7 +856,7 @@ entry:
; GCN-LABEL: {{^}}atomic_store_i64:
; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}] glc
-define void @atomic_store_i64(i64 %in, i64 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_store_i64(i64 %in, i64 addrspace(4)* %out) {
entry:
store atomic i64 %in, i64 addrspace(4)* %out seq_cst, align 8
ret void
@@ -864,7 +864,7 @@ entry:
; GCN-LABEL: {{^}}atomic_store_i64_addr64_offset:
; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
-define void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(4)* %out, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -874,7 +874,7 @@ entry:
; GCN-LABEL: {{^}}atomic_store_i64_addr64:
; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
-define void @atomic_store_i64_addr64(i64 %in, i64 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64 addrspace(4)* %out, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
store atomic i64 %in, i64 addrspace(4)* %ptr seq_cst, align 8
@@ -883,7 +883,7 @@ entry:
; GCN-LABEL: {{^}}atomic_cmpxchg_i64_offset:
; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_offset(i64 addrspace(4)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64 addrspace(4)* %out, i64 %in, i64 %old) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
@@ -892,7 +892,7 @@ entry:
; GCN-LABEL: {{^}}atomic_cmpxchg_i64_soffset:
; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_soffset(i64 addrspace(4)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64 addrspace(4)* %out, i64 %in, i64 %old) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 9000
%val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
@@ -902,7 +902,7 @@ entry:
; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_offset:
; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
@@ -913,7 +913,7 @@ entry:
; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64_offset:
; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -924,7 +924,7 @@ entry:
; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64_offset:
; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -936,7 +936,7 @@ entry:
; GCN-LABEL: {{^}}atomic_cmpxchg_i64:
; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64(i64 addrspace(4)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64(i64 addrspace(4)* %out, i64 %in, i64 %old) {
entry:
%val = cmpxchg volatile i64 addrspace(4)* %out, i64 %old, i64 %in seq_cst seq_cst
ret void
@@ -945,7 +945,7 @@ entry:
; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret:
; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) {
entry:
%val = cmpxchg volatile i64 addrspace(4)* %out, i64 %old, i64 %in seq_cst seq_cst
%extract0 = extractvalue { i64, i1 } %val, 0
@@ -955,7 +955,7 @@ entry:
; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64:
; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%val = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %old, i64 %in seq_cst seq_cst
@@ -965,7 +965,7 @@ entry:
; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64:
; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%val = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %old, i64 %in seq_cst seq_cst
diff --git a/test/CodeGen/AMDGPU/fma-combine.ll b/test/CodeGen/AMDGPU/fma-combine.ll
index 50c5a5abf7fa..4113ba8dc1f0 100644
--- a/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/test/CodeGen/AMDGPU/fma-combine.ll
@@ -18,7 +18,7 @@ declare float @llvm.fma.f32(float, float, float) #0
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -46,7 +46,7 @@ define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addr
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
-define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -75,7 +75,7 @@ define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -99,7 +99,7 @@ define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addr
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -127,7 +127,7 @@ define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
-define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -156,7 +156,7 @@ define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, d
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -184,7 +184,7 @@ define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
-define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -213,7 +213,7 @@ define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, d
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -242,7 +242,7 @@ define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
-define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -276,7 +276,7 @@ define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %o
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
-define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -317,7 +317,7 @@ define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %o
; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
; SI: buffer_store_dwordx2 [[RESULT]]
-define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -358,7 +358,7 @@ define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %
; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]
; SI: buffer_store_dwordx2 [[RESULT]]
-define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -390,7 +390,7 @@ define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
%x = load volatile float, float addrspace(1)* %in1
@@ -406,7 +406,7 @@ define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
%x = load volatile float, float addrspace(1)* %in1
@@ -422,7 +422,7 @@ define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
%x = load float, float addrspace(1)* %in1
@@ -438,7 +438,7 @@ define void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
%x = load float, float addrspace(1)* %in1
@@ -454,7 +454,7 @@ define void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
%x = load float, float addrspace(1)* %in1
@@ -470,7 +470,7 @@ define void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
%x = load float, float addrspace(1)* %in1
@@ -486,7 +486,7 @@ define void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
%x = load float, float addrspace(1)* %in1
@@ -502,7 +502,7 @@ define void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
%x = load float, float addrspace(1)* %in1
@@ -518,7 +518,7 @@ define void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
%x = load float, float addrspace(1)* %in1
@@ -534,7 +534,7 @@ define void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
%x = load float, float addrspace(1)* %in1
@@ -550,7 +550,7 @@ define void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
%x = load float, float addrspace(1)* %in1
@@ -566,7 +566,7 @@ define void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
%x = load float, float addrspace(1)* %in1
@@ -588,7 +588,7 @@ define void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
;
; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]]
-define void @test_f32_interp(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_interp(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2,
float addrspace(1)* %in3) {
@@ -610,7 +610,7 @@ define void @test_f32_interp(float addrspace(1)* %out,
;
; SI-FMA: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
; SI-FMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
-define void @test_f64_interp(double addrspace(1)* %out,
+define amdgpu_kernel void @test_f64_interp(double addrspace(1)* %out,
double addrspace(1)* %in1,
double addrspace(1)* %in2,
double addrspace(1)* %in3) {
diff --git a/test/CodeGen/AMDGPU/fma.f64.ll b/test/CodeGen/AMDGPU/fma.f64.ll
index cf6d7d824992..4d3f3712621e 100644
--- a/test/CodeGen/AMDGPU/fma.f64.ll
+++ b/test/CodeGen/AMDGPU/fma.f64.ll
@@ -8,7 +8,7 @@ declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) n
; FUNC-LABEL: {{^}}fma_f64:
; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2, double addrspace(1)* %in3) {
%r0 = load double, double addrspace(1)* %in1
%r1 = load double, double addrspace(1)* %in2
@@ -21,7 +21,7 @@ define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
; FUNC-LABEL: {{^}}fma_v2f64:
; SI: v_fma_f64
; SI: v_fma_f64
-define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
<2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) {
%r0 = load <2 x double>, <2 x double> addrspace(1)* %in1
%r1 = load <2 x double>, <2 x double> addrspace(1)* %in2
@@ -36,7 +36,7 @@ define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1
; SI: v_fma_f64
; SI: v_fma_f64
; SI: v_fma_f64
-define void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
<4 x double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) {
%r0 = load <4 x double>, <4 x double> addrspace(1)* %in1
%r1 = load <4 x double>, <4 x double> addrspace(1)* %in2
diff --git a/test/CodeGen/AMDGPU/fma.ll b/test/CodeGen/AMDGPU/fma.ll
index d04a5946b98c..659cecb59ebf 100644
--- a/test/CodeGen/AMDGPU/fma.ll
+++ b/test/CodeGen/AMDGPU/fma.ll
@@ -12,7 +12,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}},
; EG: FMA {{\*? *}}[[RES]]
-define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+define amdgpu_kernel void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
float addrspace(1)* %in2, float addrspace(1)* %in3) {
%r0 = load float, float addrspace(1)* %in1
%r1 = load float, float addrspace(1)* %in2
@@ -29,7 +29,7 @@ define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].[[CHLO:[XYZW]]][[CHHI:[XYZW]]], {{T[0-9]\.[XYZW]}},
; EG-DAG: FMA {{\*? *}}[[RES]].[[CHLO]]
; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]]
-define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
+define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
<2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) {
%r0 = load <2 x float>, <2 x float> addrspace(1)* %in1
%r1 = load <2 x float>, <2 x float> addrspace(1)* %in2
@@ -50,7 +50,7 @@ define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)*
; EG-DAG: FMA {{\*? *}}[[RES]].Y
; EG-DAG: FMA {{\*? *}}[[RES]].Z
; EG-DAG: FMA {{\*? *}}[[RES]].W
-define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
+define amdgpu_kernel void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
<4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) {
%r0 = load <4 x float>, <4 x float> addrspace(1)* %in1
%r1 = load <4 x float>, <4 x float> addrspace(1)* %in2
@@ -62,7 +62,7 @@ define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)*
; FUNC-LABEL: @fma_commute_mul_inline_imm_f32
; SI: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, 2.0, {{v[0-9]+}}
-define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+define amdgpu_kernel void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
%in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
@@ -77,7 +77,7 @@ define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, fl
}
; FUNC-LABEL: @fma_commute_mul_s_f32
-define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind {
+define amdgpu_kernel void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
%in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
diff --git a/test/CodeGen/AMDGPU/fmax3.f64.ll b/test/CodeGen/AMDGPU/fmax3.f64.ll
index 4d42a4630e22..8b9104b79e7f 100644
--- a/test/CodeGen/AMDGPU/fmax3.f64.ll
+++ b/test/CodeGen/AMDGPU/fmax3.f64.ll
@@ -11,7 +11,7 @@ declare double @llvm.maxnum.f64(double, double) nounwind readnone
; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]]
; SI: buffer_store_dwordx2 [[RESULT]],
; SI: s_endpgm
-define void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
+define amdgpu_kernel void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
%bptr = getelementptr double, double addrspace(1)* %aptr, i32 1
%cptr = getelementptr double, double addrspace(1)* %aptr, i32 2
%a = load volatile double, double addrspace(1)* %aptr, align 8
diff --git a/test/CodeGen/AMDGPU/fmax3.ll b/test/CodeGen/AMDGPU/fmax3.ll
index 7c01ca85f6b9..a96eb5db9e2a 100644
--- a/test/CodeGen/AMDGPU/fmax3.ll
+++ b/test/CodeGen/AMDGPU/fmax3.ll
@@ -10,7 +10,7 @@ declare float @llvm.maxnum.f32(float, float) nounwind readnone
; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
-define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
%a = load volatile float, float addrspace(1)* %aptr, align 4
%b = load volatile float, float addrspace(1)* %bptr, align 4
%c = load volatile float, float addrspace(1)* %cptr, align 4
@@ -28,7 +28,7 @@ define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %apt
; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
-define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
%a = load volatile float, float addrspace(1)* %aptr, align 4
%b = load volatile float, float addrspace(1)* %bptr, align 4
%c = load volatile float, float addrspace(1)* %cptr, align 4
diff --git a/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
index da498caa6b54..083346e9d1cb 100644
--- a/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
+++ b/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
@@ -4,7 +4,7 @@
declare i32 @llvm.amdgcn.workitem.id.x() #1
; FUNC-LABEL: @test_fmax_legacy_uge_f64
-define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -19,7 +19,7 @@ define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspac
}
; FUNC-LABEL: @test_fmax_legacy_oge_f64
-define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -34,7 +34,7 @@ define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspac
}
; FUNC-LABEL: @test_fmax_legacy_ugt_f64
-define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -49,7 +49,7 @@ define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspac
}
; FUNC-LABEL: @test_fmax_legacy_ogt_f64
-define void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
diff --git a/test/CodeGen/AMDGPU/fmax_legacy.ll b/test/CodeGen/AMDGPU/fmax_legacy.ll
index 4a4c92a38a35..7643c3ea533c 100644
--- a/test/CodeGen/AMDGPU/fmax_legacy.ll
+++ b/test/CodeGen/AMDGPU/fmax_legacy.ll
@@ -13,7 +13,7 @@ declare i32 @llvm.r600.read.tidig.x() #1
; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
; EG: MAX
-define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -33,7 +33,7 @@ define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(
; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
; EG: MAX
-define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -53,7 +53,7 @@ define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(
; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
; EG: MAX
-define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -73,7 +73,7 @@ define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(
; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
; EG: MAX
-define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -93,7 +93,7 @@ define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(
; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
; EG: MAX
-define void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1
@@ -114,7 +114,7 @@ define void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x flo
; SI-NONAN: v_max_f32_e32
; SI-NONAN: v_max_f32_e32
; SI-NONAN: v_max_f32_e32
-define void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr <3 x float>, <3 x float> addrspace(1)* %gep.0, i32 1
@@ -137,7 +137,7 @@ define void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x flo
; SI-NOT: v_max_
; EG: MAX
-define void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
diff --git a/test/CodeGen/AMDGPU/fmaxnum.f64.ll b/test/CodeGen/AMDGPU/fmaxnum.f64.ll
index fec3a358a4fa..20af278bf98c 100644
--- a/test/CodeGen/AMDGPU/fmaxnum.f64.ll
+++ b/test/CodeGen/AMDGPU/fmaxnum.f64.ll
@@ -9,7 +9,7 @@ declare <16 x double> @llvm.maxnum.v16f64(<16 x double>, <16 x double>) #0
; FUNC-LABEL: @test_fmax_f64
; SI: v_max_f64
-define void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
+define amdgpu_kernel void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
%val = call double @llvm.maxnum.f64(double %a, double %b) #0
store double %val, double addrspace(1)* %out, align 8
ret void
@@ -18,7 +18,7 @@ define void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) noun
; FUNC-LABEL: @test_fmax_v2f64
; SI: v_max_f64
; SI: v_max_f64
-define void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
%val = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b) #0
store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16
ret void
@@ -29,7 +29,7 @@ define void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <
; SI: v_max_f64
; SI: v_max_f64
; SI: v_max_f64
-define void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
%val = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %a, <4 x double> %b) #0
store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32
ret void
@@ -44,7 +44,7 @@ define void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <
; SI: v_max_f64
; SI: v_max_f64
; SI: v_max_f64
-define void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
%val = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %a, <8 x double> %b) #0
store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64
ret void
@@ -67,7 +67,7 @@ define void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <
; SI: v_max_f64
; SI: v_max_f64
; SI: v_max_f64
-define void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
%val = call <16 x double> @llvm.maxnum.v16f64(<16 x double> %a, <16 x double> %b) #0
store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128
ret void
diff --git a/test/CodeGen/AMDGPU/fmaxnum.ll b/test/CodeGen/AMDGPU/fmaxnum.ll
index 4058247a6da9..277b8ce04c4e 100644
--- a/test/CodeGen/AMDGPU/fmaxnum.ll
+++ b/test/CodeGen/AMDGPU/fmaxnum.ll
@@ -14,7 +14,7 @@ declare double @llvm.maxnum.f64(double, double)
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG: MAX_DX10 {{.*}}[[OUT]]
-define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
%val = call float @llvm.maxnum.f32(float %a, float %b) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -27,7 +27,7 @@ define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwin
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
; EG: MAX_DX10 {{.*}}[[OUT]]
; EG: MAX_DX10 {{.*}}[[OUT]]
-define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
%val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) #0
store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8
ret void
@@ -44,7 +44,7 @@ define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2
; EG: MAX_DX10 {{.*}}[[OUT]]
; EG: MAX_DX10 {{.*}}[[OUT]]
; EG: MAX_DX10 {{.*}}[[OUT]]
-define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
%val = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) #0
store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16
ret void
@@ -70,7 +70,7 @@ define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4
; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y
; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z
; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W
-define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
%val = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> %b) #0
store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32
ret void
@@ -114,7 +114,7 @@ define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8
; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Y
; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Z
; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].W
-define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
%val = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %a, <16 x float> %b) #0
store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64
ret void
@@ -128,7 +128,7 @@ define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a,
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG-NOT: MAX_DX10
; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind {
%val = call float @llvm.maxnum.f32(float 1.0, float 2.0) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -143,7 +143,7 @@ define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind {
; EG-NOT: MAX_DX10
; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
; EG: 2143289344(nan)
-define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind {
%val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -157,7 +157,7 @@ define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind {
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG-NOT: MAX_DX10
; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind {
%val = call float @llvm.maxnum.f32(float 1.0, float 0x7FF8000000000000) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -171,7 +171,7 @@ define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind {
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG-NOT: MAX_DX10
; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind {
%val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 1.0) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -185,7 +185,7 @@ define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind {
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG-NOT: MAX_DX10
; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind {
%val = call float @llvm.maxnum.f32(float 0.0, float 0.0) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -199,7 +199,7 @@ define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind {
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG-NOT: MAX_DX10
; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind {
%val = call float @llvm.maxnum.f32(float 0.0, float -0.0) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -213,7 +213,7 @@ define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind {
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG-NOT: MAX_DX10
; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind {
%val = call float @llvm.maxnum.f32(float -0.0, float 0.0) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -227,7 +227,7 @@ define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind {
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG-NOT: MAX_DX10
; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind {
%val = call float @llvm.maxnum.f32(float -0.0, float -0.0) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -239,7 +239,7 @@ define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind {
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG-NOT: MAX_DX10
; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
%val = call float @llvm.maxnum.f32(float %a, float 2.0) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -250,7 +250,7 @@ define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
%val = call float @llvm.maxnum.f32(float 2.0, float %a) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -262,7 +262,7 @@ define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
%val = call float @llvm.maxnum.f32(float %a, float 99.0) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -274,7 +274,7 @@ define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
%val = call float @llvm.maxnum.f32(float 99.0, float %a) #0
store float %val, float addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/fmed3.ll b/test/CodeGen/AMDGPU/fmed3.ll
index e66678069130..d2cfc713ed37 100644
--- a/test/CodeGen/AMDGPU/fmed3.ll
+++ b/test/CodeGen/AMDGPU/fmed3.ll
@@ -1,18 +1,33 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
-declare i32 @llvm.amdgcn.workitem.id.x() #0
-declare float @llvm.minnum.f32(float, float) #0
-declare float @llvm.maxnum.f32(float, float) #0
-declare double @llvm.minnum.f64(double, double) #0
-declare double @llvm.maxnum.f64(double, double) #0
+
+
+; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f32:
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v{{[0-9]+}}
+; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 2.0, 4.0
+define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %a.add = fadd nnan float %a, 1.0
+ %max = call float @llvm.maxnum.f32(float %a.add, float 2.0)
+ %med = call float @llvm.minnum.f32(float %max, float 4.0)
+ store float %med, float addrspace(1)* %outgep
+ ret void
+}
; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f32:
; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-define void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -30,7 +45,7 @@ define void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)
; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-define void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -48,7 +63,7 @@ define void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float add
; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-define void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -64,7 +79,7 @@ define void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float add
; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_constant_order_f32:
; GCN: v_max_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
; GCN: v_min_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
-define void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -81,7 +96,7 @@ define void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, flo
; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_multi_use_f32:
; GCN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
; GCN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-define void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -98,7 +113,7 @@ define void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float ad
; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f64:
; GCN: v_max_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 2.0
; GCN: v_min_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 4.0
-define void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr double, double addrspace(1)* %out, i32 %tid
@@ -113,7 +128,7 @@ define void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(
; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_no_nans_f32:
; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
-define void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -131,7 +146,7 @@ define void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addr
; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-define void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -149,6 +164,812 @@ define void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrs
ret void
}
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod0:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], [[B]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %a.fneg = fsub float -0.0, %a
+ %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
+ %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b)
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod1:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], -[[B]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %b.fneg = fsub float -0.0, %b
+ %tmp0 = call float @llvm.minnum.f32(float %a, float %b.fneg)
+ %tmp1 = call float @llvm.maxnum.f32(float %a, float %b.fneg)
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod2:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], -[[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %c.fneg = fsub float -0.0, %c
+ %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fneg)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod012:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], |[[B]]|, -|[[C]]|
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+
+ %a.fneg = fsub float -0.0, %a
+ %b.fabs = call float @llvm.fabs.f32(float %b)
+ %c.fabs = call float @llvm.fabs.f32(float %c)
+ %c.fabs.fneg = fsub float -0.0, %c.fabs
+
+ %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b.fabs)
+ %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b.fabs)
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_negabs012:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, -|[[A]]|, -|[[B]]|, -|[[C]]|
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+
+ %a.fabs = call float @llvm.fabs.f32(float %a)
+ %a.fabs.fneg = fsub float -0.0, %a.fabs
+ %b.fabs = call float @llvm.fabs.f32(float %b)
+ %b.fabs.fneg = fsub float -0.0, %b.fabs
+ %c.fabs = call float @llvm.fabs.f32(float %c)
+ %c.fabs.fneg = fsub float -0.0, %c.fabs
+
+ %tmp0 = call float @llvm.minnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
+ %tmp1 = call float @llvm.maxnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_nnan_inputs_med3_f32_pat0:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN-DAG: v_add_f32_e32 [[A_ADD:v[0-9]+]], 1.0, [[A]]
+; GCN-DAG: v_add_f32_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]]
+; GCN-DAG: v_add_f32_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]]
+define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+
+ %a.nnan = fadd nnan float %a, 1.0
+ %b.nnan = fadd nnan float %b, 2.0
+ %c.nnan = fadd nnan float %c, 4.0
+
+ %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan)
+ %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan)
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; 16 combinations
+
+; 0: max(min(x, y), min(max(x, y), z))
+; 1: max(min(x, y), min(max(y, x), z))
+; 2: max(min(x, y), min(z, max(x, y)))
+; 3: max(min(x, y), min(z, max(y, x)))
+; 4: max(min(y, x), min(max(x, y), z))
+; 5: max(min(y, x), min(max(y, x), z))
+; 6: max(min(y, x), min(z, max(x, y)))
+; 7: max(min(y, x), min(z, max(y, x)))
+;
+; + commute outermost max
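+;
+; Worked instance (illustrative comment only, not a FileCheck directive):
+; with x = 1.0, y = 4.0, z = 2.0, pattern 0 computes
+;   max(min(1, 4), min(max(1, 4), 2)) = max(1, min(4, 2)) = max(1, 2) = 2,
+; the median of the three inputs; every combination above therefore folds
+; to a single v_med3_f32 once NaNs are excluded.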
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat1:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat2:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat3:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat4:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat5:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat6:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat7:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat8:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat9:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat10:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat11:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat12:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat13:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat14:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat15:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; ---------------------------------------------------------------------
+; Negative patterns
+; ---------------------------------------------------------------------
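+;
+; Summary of the cases below: the med3 fold must not fire when an
+; intermediate min/max result has another use, when NaNs cannot be ruled
+; out (an nnan flag missing or "no-nans-fp-math"="false"), or when the
+; source modifiers on the repeated operand do not match.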
+
+; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use0:
+; GCN-DAG: v_min_f32
+; GCN-DAG: v_max_f32
+; GCN: v_min_f32
+; GCN: v_max_f32
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+ store volatile float %tmp0, float addrspace(1)* undef
+ %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use1:
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+ store volatile float %tmp1, float addrspace(1)* undef
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use2:
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+ store volatile float %tmp2, float addrspace(1)* undef
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+
+; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0:
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_nnan_inputs_missing0_med3_f32_pat0:
+define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+
+ %a.nnan = fadd float %a, 1.0
+ %b.nnan = fadd nnan float %b, 2.0
+ %c.nnan = fadd nnan float %c, 4.0
+
+ %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan)
+ %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan)
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_nnan_inputs_missing1_med3_f32_pat0:
+define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+
+ %a.nnan = fadd nnan float %a, 1.0
+ %b.nnan = fadd float %b, 2.0
+ %c.nnan = fadd nnan float %c, 4.0
+
+ %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan)
+ %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan)
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_nnan_inputs_missing2_med3_f32_pat0:
+define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+
+ %a.nnan = fadd nnan float %a, 1.0
+ %b.nnan = fadd nnan float %b, 2.0
+ %c.nnan = fadd float %c, 4.0
+
+ %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan)
+ %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan)
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_min_f32
+; GCN: v_max_f32
+; GCN: v_min_f32
+; GCN: v_max_f32
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %a.fneg = fsub float -0.0, %a
+ %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
+ %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ store float %med3, float addrspace(1)* %outgep
+ ret void
+}
+
+; A simple max followed by min (a clamp) is not sufficient to form a med3;
+; the min(x, y) operand of the pattern is missing.
+; GCN-LABEL: {{^}}v_test_global_nnans_min_max_f32:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[B]], [[A]]
+; GCN: v_min_f32_e32 v{{[0-9]+}}, [[C]], [[MAX]]
+define amdgpu_kernel void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load volatile float, float addrspace(1)* %gep0
+ %b = load volatile float, float addrspace(1)* %gep1
+ %c = load volatile float, float addrspace(1)* %gep2
+ %max = call float @llvm.maxnum.f32(float %a, float %b)
+ %minmax = call float @llvm.minnum.f32(float %max, float %c)
+ store float %minmax, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f16:
+; SI: v_cvt_f32_f16
+; SI: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+; SI: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
+; SI: v_cvt_f16_f32
+
+; VI: v_add_f16_e32 v{{[0-9]+}}, 1.0
+; VI: v_max_f16_e32 v{{[0-9]+}}, 2.0
+; VI: v_min_f16_e32 v{{[0-9]+}}, 4.0
+
+; GFX9: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0
+; GFX9: v_med3_f16 v{{[0-9]+}}, [[ADD]], 2.0, 4.0
+define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr half, half addrspace(1)* %out, i32 %tid
+ %a = load half, half addrspace(1)* %gep0
+ %a.add = fadd nnan half %a, 1.0
+ %max = call half @llvm.maxnum.f16(half %a.add, half 2.0)
+ %med = call half @llvm.minnum.f16(half %max, half 4.0)
+
+ store half %med, half addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_nnan_inputs_med3_f16_pat0:
+; GCN: {{buffer_|flat_}}load_ushort [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_ushort [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_ushort [[C:v[0-9]+]]
+
+; SI: v_cvt_f32_f16
+; SI: v_cvt_f32_f16
+; SI: v_add_f32_e32
+; SI: v_add_f32_e32
+; SI: v_add_f32_e32
+; SI: v_med3_f32
+; SI: v_cvt_f16_f32_e32
+
+
+; GFX89-DAG: v_add_f16_e32 [[A_ADD:v[0-9]+]], 1.0, [[A]]
+; GFX89-DAG: v_add_f16_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]]
+; GFX89-DAG: v_add_f16_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]]
+
+; VI-DAG: v_min_f16
+; VI-DAG: v_max_f16
+; VI: v_min_f16
+; VI: v_max_f16
+
+; GFX9: v_med3_f16 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]]
+define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr half, half addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr half, half addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr half, half addrspace(1)* %out, i32 %tid
+ %a = load volatile half, half addrspace(1)* %gep0
+ %b = load volatile half, half addrspace(1)* %gep1
+ %c = load volatile half, half addrspace(1)* %gep2
+
+ %a.nnan = fadd nnan half %a, 1.0
+ %b.nnan = fadd nnan half %b, 2.0
+ %c.nnan = fadd nnan half %c, 4.0
+
+ %tmp0 = call half @llvm.minnum.f16(half %a.nnan, half %b.nnan)
+ %tmp1 = call half @llvm.maxnum.f16(half %a.nnan, half %b.nnan)
+ %tmp2 = call half @llvm.minnum.f16(half %tmp1, half %c.nnan)
+ %med3 = call half @llvm.maxnum.f16(half %tmp0, half %tmp2)
+ store half %med3, half addrspace(1)* %outgep
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare float @llvm.fabs.f32(float) #0
+declare float @llvm.minnum.f32(float, float) #0
+declare float @llvm.maxnum.f32(float, float) #0
+declare double @llvm.minnum.f64(double, double) #0
+declare double @llvm.maxnum.f64(double, double) #0
+declare half @llvm.fabs.f16(half) #0
+declare half @llvm.minnum.f16(half, half) #0
+declare half @llvm.maxnum.f16(half, half) #0
+
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/fmin3.ll b/test/CodeGen/AMDGPU/fmin3.ll
index 3102ffdbdd28..3183f77f090b 100644
--- a/test/CodeGen/AMDGPU/fmin3.ll
+++ b/test/CodeGen/AMDGPU/fmin3.ll
@@ -11,7 +11,7 @@ declare float @llvm.minnum.f32(float, float) nounwind readnone
; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
-define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
%a = load volatile float, float addrspace(1)* %aptr, align 4
%b = load volatile float, float addrspace(1)* %bptr, align 4
%c = load volatile float, float addrspace(1)* %cptr, align 4
@@ -29,7 +29,7 @@ define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %apt
; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
-define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
%a = load volatile float, float addrspace(1)* %aptr, align 4
%b = load volatile float, float addrspace(1)* %bptr, align 4
%c = load volatile float, float addrspace(1)* %cptr, align 4
diff --git a/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll b/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll
new file mode 100644
index 000000000000..fdfe533b3d0c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll
@@ -0,0 +1,47 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN %s
+; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-NONAN -check-prefix=GCN %s
+
+; FIXME: Should replace unsafe-fp-math with no signed zeros.
+
+; GCN-LABEL: {{^}}min_fneg_select_regression_0:
+; GCN-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
+; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, -1.0
+define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 {
+ %fneg.a = fsub float -0.0, %a
+ %cmp.a = fcmp ult float %a, 1.0
+ %min.a = select i1 %cmp.a, float %fneg.a, float -1.0
+ ret float %min.a
+}
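+; Informal note: the safe lowering must use the legacy min/max, whose NaN
+; behavior matches the unordered compare being selected on; with no-NaNs,
+; the plain IEEE op with the fneg folded in is enough, as the two check
+; prefixes above reflect.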
+
+; GCN-LABEL: {{^}}min_fneg_select_regression_posk_0:
+; GCN-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0
+define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0 {
+ %fneg.a = fsub float -0.0, %a
+ %cmp.a = fcmp ult float %a, -1.0
+ %min.a = select i1 %cmp.a, float %fneg.a, float 1.0
+ ret float %min.a
+}
+
+; GCN-LABEL: {{^}}max_fneg_select_regression_0:
+; GCN-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
+; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, -1.0
+define amdgpu_ps float @max_fneg_select_regression_0(float %a, float %b) #0 {
+ %fneg.a = fsub float -0.0, %a
+ %cmp.a = fcmp ugt float %a, 1.0
+ %min.a = select i1 %cmp.a, float %fneg.a, float -1.0
+ ret float %min.a
+}
+
+; GCN-LABEL: {{^}}max_fneg_select_regression_posk_0:
+; GCN-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, 1.0
+define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a, float %b) #0 {
+ %fneg.a = fsub float -0.0, %a
+ %cmp.a = fcmp ugt float %a, -1.0
+ %min.a = select i1 %cmp.a, float %fneg.a, float 1.0
+ ret float %min.a
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
index 6982ee0c0cb3..99bc114831ca 100644
--- a/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
+++ b/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
@@ -3,7 +3,7 @@
declare i32 @llvm.amdgcn.workitem.id.x() #1
; FUNC-LABEL: @test_fmin_legacy_f64
-define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 {
+define amdgpu_kernel void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 {
%r0 = extractelement <4 x double> %reg0, i32 0
%r1 = extractelement <4 x double> %reg0, i32 1
%r2 = fcmp uge double %r0, %r1
@@ -14,7 +14,7 @@ define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double>
}
; FUNC-LABEL: @test_fmin_legacy_ule_f64
-define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -29,7 +29,7 @@ define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspac
}
; FUNC-LABEL: @test_fmin_legacy_ole_f64
-define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -44,7 +44,7 @@ define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspac
}
; FUNC-LABEL: @test_fmin_legacy_olt_f64
-define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -59,7 +59,7 @@ define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspac
}
; FUNC-LABEL: @test_fmin_legacy_ult_f64
-define void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
diff --git a/test/CodeGen/AMDGPU/fmin_legacy.ll b/test/CodeGen/AMDGPU/fmin_legacy.ll
index 79acd02e6d1f..52336f95a909 100644
--- a/test/CodeGen/AMDGPU/fmin_legacy.ll
+++ b/test/CodeGen/AMDGPU/fmin_legacy.ll
@@ -14,7 +14,7 @@ declare i32 @llvm.r600.read.tidig.x() #1
; EG: MIN *
; SI-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; SI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 {
+define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 {
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = extractelement <4 x float> %reg0, i32 1
%r2 = fcmp uge float %r0, %r1
@@ -34,7 +34,7 @@ define void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addrspace(1)* %out
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[VA]]
; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[VB]]
-define void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 {
%cmp = fcmp ule float %a, %b
%val = select i1 %cmp, float %a, float %b
store float %val, float addrspace(1)* %out, align 4
@@ -46,7 +46,7 @@ define void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, floa
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -65,7 +65,7 @@ define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -84,7 +84,7 @@ define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -103,7 +103,7 @@ define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -122,7 +122,7 @@ define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-define void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1
@@ -144,7 +144,7 @@ define void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x flo
; SI-NONAN: v_min_f32_e32
; SI-NONAN: v_min_f32_e32
-define void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %gep.0, i32 1
@@ -166,7 +166,7 @@ define void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x flo
; SI-NONAN: v_min_f32_e32
; SI-NONAN: v_min_f32_e32
; SI-NONAN: v_min_f32_e32
-define void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr <3 x float>, <3 x float> addrspace(1)* %gep.0, i32 1
@@ -188,7 +188,7 @@ define void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x flo
; SI-NEXT: v_cndmask_b32
; SI-NOT: v_min
; SI: s_endpgm
-define void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
diff --git a/test/CodeGen/AMDGPU/fminnum.f64.ll b/test/CodeGen/AMDGPU/fminnum.f64.ll
index 0f929d6a81f0..01b267411212 100644
--- a/test/CodeGen/AMDGPU/fminnum.f64.ll
+++ b/test/CodeGen/AMDGPU/fminnum.f64.ll
@@ -9,7 +9,7 @@ declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0
; FUNC-LABEL: @test_fmin_f64
; SI: v_min_f64
-define void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
+define amdgpu_kernel void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
%val = call double @llvm.minnum.f64(double %a, double %b) #0
store double %val, double addrspace(1)* %out, align 8
ret void
@@ -18,7 +18,7 @@ define void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) noun
; FUNC-LABEL: @test_fmin_v2f64
; SI: v_min_f64
; SI: v_min_f64
-define void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
%val = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) #0
store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16
ret void
@@ -29,7 +29,7 @@ define void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <
; SI: v_min_f64
; SI: v_min_f64
; SI: v_min_f64
-define void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
%val = call <4 x double> @llvm.minnum.v4f64(<4 x double> %a, <4 x double> %b) #0
store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32
ret void
@@ -44,7 +44,7 @@ define void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <
; SI: v_min_f64
; SI: v_min_f64
; SI: v_min_f64
-define void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
%val = call <8 x double> @llvm.minnum.v8f64(<8 x double> %a, <8 x double> %b) #0
store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64
ret void
@@ -67,7 +67,7 @@ define void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <
; SI: v_min_f64
; SI: v_min_f64
; SI: v_min_f64
-define void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
%val = call <16 x double> @llvm.minnum.v16f64(<16 x double> %a, <16 x double> %b) #0
store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128
ret void
diff --git a/test/CodeGen/AMDGPU/fminnum.ll b/test/CodeGen/AMDGPU/fminnum.ll
index abd2b9d3e4d1..9e997c7a1045 100644
--- a/test/CodeGen/AMDGPU/fminnum.ll
+++ b/test/CodeGen/AMDGPU/fminnum.ll
@@ -13,7 +13,7 @@ declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #0
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG: MIN_DX10 {{.*}}[[OUT]]
-define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
%val = call float @llvm.minnum.f32(float %a, float %b) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -26,7 +26,7 @@ define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwin
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
; EG: MIN_DX10 {{.*}}[[OUT]]
; EG: MIN_DX10 {{.*}}[[OUT]]
-define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
%val = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b) #0
store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8
ret void
@@ -43,7 +43,7 @@ define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2
; EG: MIN_DX10 {{.*}}[[OUT]]
; EG: MIN_DX10 {{.*}}[[OUT]]
; EG: MIN_DX10 {{.*}}[[OUT]]
-define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
%val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) #0
store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16
ret void
@@ -69,7 +69,7 @@ define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4
; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y
; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z
; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W
-define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
%val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b) #0
store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32
ret void
@@ -113,7 +113,7 @@ define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8
; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Y
; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Z
; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].W
-define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
%val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b) #0
store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64
ret void
@@ -127,7 +127,7 @@ define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a,
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG-NOT: MIN_DX10
; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind {
%val = call float @llvm.minnum.f32(float 1.0, float 2.0) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -142,7 +142,7 @@ define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind {
; EG-NOT: MIN_DX10
; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
; EG: 2143289344({{nan|1\.#QNAN0e\+00}})
-define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind {
%val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -156,7 +156,7 @@ define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind {
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG-NOT: MIN_DX10
; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind {
%val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -170,7 +170,7 @@ define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind {
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG-NOT: MIN_DX10
; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind {
%val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -184,7 +184,7 @@ define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind {
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG-NOT: MIN_DX10
; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind {
%val = call float @llvm.minnum.f32(float 0.0, float 0.0) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -198,7 +198,7 @@ define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind {
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG-NOT: MIN_DX10
; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind {
%val = call float @llvm.minnum.f32(float 0.0, float -0.0) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -212,7 +212,7 @@ define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind {
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG-NOT: MIN_DX10
; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind {
%val = call float @llvm.minnum.f32(float -0.0, float 0.0) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -226,7 +226,7 @@ define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind {
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG-NOT: MIN_DX10
; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind {
%val = call float @llvm.minnum.f32(float -0.0, float -0.0) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -237,7 +237,7 @@ define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind {
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
%val = call float @llvm.minnum.f32(float %a, float 2.0) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -248,7 +248,7 @@ define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
%val = call float @llvm.minnum.f32(float 2.0, float %a) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -260,7 +260,7 @@ define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
%val = call float @llvm.minnum.f32(float %a, float 99.0) #0
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -272,7 +272,7 @@ define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
%val = call float @llvm.minnum.f32(float 99.0, float %a) #0
store float %val, float addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index 8663a2129fc0..4002712ab169 100644
--- a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -1,5 +1,7 @@
; XUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
+
; Make sure the combine (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) does
; not force the add to be emitted as a separate instruction when the fadd has
; more than one use.
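;
; Illustrative sketch (hypothetical IR, not one of the tests below): if %a
; had a single use, %m could fold to (fmul fast float %x, 8.0); here the
; store keeps the fadd alive, so the fold would only add an instruction:
;   %a = fadd fast float %x, %x
;   %m = fmul fast float %a, 4.0
;   store float %a, float addrspace(1)* %p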
@@ -19,9 +21,9 @@ declare float @llvm.fabs.f32(float) #1
; VI: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}|
; VI: v_cndmask_b32_e32
; VI: v_add_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
-; VI: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
-; VI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0
-define void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
+; VI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
+define amdgpu_kernel void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
%a11 = fadd fast float %y, -1.0
%a12 = call float @llvm.fabs.f32(float %a11)
%a13 = fadd fast float %x, -1.0
@@ -42,7 +44,7 @@ define void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, floa
; GCN-DAG: buffer_store_dword [[MUL2]]
; GCN-DAG: buffer_store_dword [[MAD]]
; GCN: s_endpgm
-define void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, float %y) #0 {
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
%mul2 = fmul fast float %x, 2.0
%mad = fadd fast float %mul2, %y
@@ -57,7 +59,7 @@ define void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, floa
; GCN-DAG: buffer_store_dword [[MUL2]]
; GCN-DAG: buffer_store_dword [[MAD]]
; GCN: s_endpgm
-define void @multiple_use_fadd_fmad_f32(float addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @multiple_use_fadd_fmad_f32(float addrspace(1)* %out, float %x, float %y) #0 {
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
%x.abs = call float @llvm.fabs.f32(float %x)
%mul2 = fmul fast float %x.abs, 2.0
@@ -70,7 +72,7 @@ define void @multiple_use_fadd_fmad_f32(float addrspace(1)* %out, float %x, floa
; GCN-LABEL: {{^}}multiple_use_fadd_multi_fmad_f32:
; GCN: v_mad_f32 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}}
; GCN: v_mad_f32 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}}
-define void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
+define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
%x.abs = call float @llvm.fabs.f32(float %x)
%mul2 = fmul fast float %x.abs, 2.0
@@ -85,7 +87,7 @@ define void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x
; GCN: v_mul_f32_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
; GCN: buffer_store_dword [[RESULT]]
-define void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 {
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
%mul2 = fmul fast float %x, 2.0
%muln2 = fmul fast float %x, -2.0
@@ -99,7 +101,7 @@ define void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 {
; GCN: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]]
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
; GCN: buffer_store_dword [[RESULT]]
-define void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 {
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
%mul2 = fmul fast float %x, 2.0
%muln2 = fmul fast float %x, -3.0
@@ -114,9 +116,10 @@ define void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 {
; VI: v_cmp_gt_f16_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}|
; VI: v_cndmask_b32_e32
; VI: v_add_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
-; VI: v_mul_f16_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
-; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0
-define void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
+; VI: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
+; VI-DENORM: v_fma_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
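+; (With fp16 denormals enabled, v_mad_f16 would flush intermediate denormal
+; results, so the denormal-correct v_fma_f16 is expected instead; hence the
+; VI-FLUSH / VI-DENORM split above.)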
+define amdgpu_kernel void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
%x = bitcast i16 %x.arg to half
%y = bitcast i16 %y.arg to half
%z = bitcast i16 %z.arg to half
@@ -136,11 +139,14 @@ define void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.
; GCN-LABEL: {{^}}multiple_use_fadd_fmac_f16:
; GCN-DAG: v_add_f16_e64 [[MUL2:v[0-9]+]], [[X:s[0-9]+]], s{{[0-9]+}}
-; GCN-DAG: v_mac_f16_e64 [[MAD:v[0-9]+]], [[X]], 2.0
+
+; VI-FLUSH-DAG: v_mac_f16_e64 [[MAD:v[0-9]+]], [[X]], 2.0
+; VI-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], [[X]], 2.0, v{{[0-9]+}}
+
; GCN-DAG: buffer_store_short [[MUL2]]
; GCN-DAG: buffer_store_short [[MAD]]
; GCN: s_endpgm
-define void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
+define amdgpu_kernel void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
%x = bitcast i16 %x.arg to half
%y = bitcast i16 %y.arg to half
%out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
@@ -153,11 +159,14 @@ define void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i16 zeroext %x.
; GCN-LABEL: {{^}}multiple_use_fadd_fmad_f16:
; GCN-DAG: v_add_f16_e64 [[MUL2:v[0-9]+]], |[[X:s[0-9]+]]|, |s{{[0-9]+}}|
-; GCN-DAG: v_mad_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}}
+
+; VI-FLUSH-DAG: v_mad_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}}
+; VI-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}}
+
; GCN-DAG: buffer_store_short [[MUL2]]
; GCN-DAG: buffer_store_short [[MAD]]
; GCN: s_endpgm
-define void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
+define amdgpu_kernel void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
%x = bitcast i16 %x.arg to half
%y = bitcast i16 %y.arg to half
%out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
@@ -170,9 +179,13 @@ define void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.
}
; GCN-LABEL: {{^}}multiple_use_fadd_multi_fmad_f16:
-; GCN: v_mad_f16 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}}
-; GCN: v_mad_f16 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}}
-define void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
+; VI-FLUSH: v_mad_f16 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}}
+; VI-FLUSH: v_mad_f16 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}}
+
+; VI-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}}
+; VI-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}}
+
+define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
%x = bitcast i16 %x.arg to half
%y = bitcast i16 %y.arg to half
%z = bitcast i16 %z.arg to half
@@ -190,7 +203,7 @@ define void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroe
; GCN: v_mul_f16_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0
; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
; GCN: buffer_store_short [[RESULT]]
-define void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
+define amdgpu_kernel void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
%x = bitcast i16 %x.arg to half
%y = bitcast i16 %y.arg to half
%out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
@@ -206,7 +219,7 @@ define void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 ze
; GCN: v_mul_f16_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]]
; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
; GCN: buffer_store_short [[RESULT]]
-define void @fmul_x2_xn3_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
+define amdgpu_kernel void @fmul_x2_xn3_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
%x = bitcast i16 %x.arg to half
%y = bitcast i16 %y.arg to half
%out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
diff --git a/test/CodeGen/AMDGPU/fmul.f16.ll b/test/CodeGen/AMDGPU/fmul.f16.ll
index 4f47d2c8e755..4e96091ae256 100644
--- a/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -11,7 +11,7 @@
; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @fmul_f16(
+define amdgpu_kernel void @fmul_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -25,14 +25,13 @@ entry:
; GCN-LABEL: {{^}}fmul_f16_imm_a:
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x4200{{$}}
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
+; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
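;
; (0x4200 is half 3.0; after SI promotes the f16 multiply to f32, the
; equivalent f32 literal 0x40400000 is folded straight into v_mul_f32.)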
-define void @fmul_f16_imm_a(
+define amdgpu_kernel void @fmul_f16_imm_a(
half addrspace(1)* %r,
half addrspace(1)* %b) {
entry:
@@ -44,14 +43,14 @@ entry:
; GCN-LABEL: {{^}}fmul_f16_imm_b:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4400{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
+; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
+
; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @fmul_f16_imm_b(
+define amdgpu_kernel void @fmul_f16_imm_b(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -61,27 +60,30 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fmul_v2f16
+; GCN-LABEL: {{^}}fmul_v2f16:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @fmul_v2f16(
+define amdgpu_kernel void @fmul_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
@@ -93,25 +95,22 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fmul_v2f16_imm_a
+; GCN-LABEL: {{^}}fmul_v2f16_imm_a:
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], 0x4200{{$}}
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], 0x4400{{$}}
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
+; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
+; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
-; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; VI-DAG: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
+; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
+; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @fmul_v2f16_imm_a(
+define amdgpu_kernel void @fmul_v2f16_imm_a(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %b) {
entry:
@@ -121,25 +120,22 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fmul_v2f16_imm_b
+; GCN-LABEL: {{^}}fmul_v2f16_imm_b:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], 0x4400{{$}}
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], 0x4200{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
+; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
-; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; VI-DAG: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
+; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
+; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @fmul_v2f16_imm_b(
+define amdgpu_kernel void @fmul_v2f16_imm_b(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/fmul.ll b/test/CodeGen/AMDGPU/fmul.ll
index d0c39b539456..125de7aabfd4 100644
--- a/test/CodeGen/AMDGPU/fmul.ll
+++ b/test/CodeGen/AMDGPU/fmul.ll
@@ -6,24 +6,20 @@
; GCN: v_mul_f32
; R600: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
-define void @fmul_f32(float addrspace(1)* %out, float %a, float %b) {
+define amdgpu_kernel void @fmul_f32(float addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fmul float %a, %b
store float %0, float addrspace(1)* %out
ret void
}
-declare float @llvm.r600.load.input(i32) readnone
-
-declare void @llvm.AMDGPU.store.output(float, i32)
-
; FUNC-LABEL: {{^}}fmul_v2f32:
; GCN: v_mul_f32
; GCN: v_mul_f32
; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
-define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
entry:
%0 = fmul <2 x float> %a, %b
store <2 x float> %0, <2 x float> addrspace(1)* %out
@@ -40,7 +36,7 @@ entry:
; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float>, <4 x float> addrspace(1) * %in
%b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
@@ -53,7 +49,7 @@ define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)
; GCN: v_mul_f32
; GCN-NOT: v_mul_f32
; GCN: s_endpgm
-define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 {
%y = fmul float %x, 2.0
%z = fmul float %y, 3.0
store float %z, float addrspace(1)* %out
@@ -65,7 +61,7 @@ define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 {
; GCN-NOT: v_mul_f32
; GCN-NOT: v_mad_f32
; GCN: s_endpgm
-define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 {
%y = fmul float %x, 3.0
%z = fmul float %y, 2.0
store float %z, float addrspace(1)* %out
@@ -79,7 +75,7 @@ define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 {
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN-NOT: v_mul_f32
-define void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 {
%a = fmul float %x, 5.0
%b = fsub float -0.0, %a
%c = fmul float %b, %y
diff --git a/test/CodeGen/AMDGPU/fmul64.ll b/test/CodeGen/AMDGPU/fmul64.ll
index 3c222eaba89d..f14233f267b2 100644
--- a/test/CodeGen/AMDGPU/fmul64.ll
+++ b/test/CodeGen/AMDGPU/fmul64.ll
@@ -3,7 +3,7 @@
; FUNC-LABEL: {{^}}fmul_f64:
; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
%r1 = load double, double addrspace(1)* %in2
@@ -15,7 +15,7 @@ define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
; FUNC-LABEL: {{^}}fmul_v2f64:
; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
<2 x double> addrspace(1)* %in2) {
%r0 = load <2 x double>, <2 x double> addrspace(1)* %in1
%r1 = load <2 x double>, <2 x double> addrspace(1)* %in2
@@ -29,7 +29,7 @@ define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(
; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
<4 x double> addrspace(1)* %in2) {
%r0 = load <4 x double>, <4 x double> addrspace(1)* %in1
%r1 = load <4 x double>, <4 x double> addrspace(1)* %in2
diff --git a/test/CodeGen/AMDGPU/fmuladd.f16.ll b/test/CodeGen/AMDGPU/fmuladd.f16.ll
index 500b00bdcf87..9b713419e747 100644
--- a/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -1,12 +1,12 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fmuladd.f16(half, half, half) #1
@@ -16,7 +16,7 @@ declare half @llvm.fabs.f16(half) #1
; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-define void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
+define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
%r0 = load half, half addrspace(1)* %in1
%r1 = load half, half addrspace(1)* %in2
@@ -34,7 +34,7 @@ define void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -56,7 +56,7 @@ define void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -82,7 +82,7 @@ define void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in
; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_a_a_b_f16(half addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
half addrspace(1)* %in1,
half addrspace(1)* %in2) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -111,7 +111,7 @@ define void @fadd_a_a_b_f16(half addrspace(1)* %out,
; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_b_a_a_f16(half addrspace(1)* %out,
+define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
half addrspace(1)* %in1,
half addrspace(1)* %in2) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -134,7 +134,7 @@ define void @fadd_b_a_a_f16(half addrspace(1)* %out,
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-DENORM: v_fma_f16 [[R2:v[0-9]+]], [[R1]], -2.0, [[R2]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-define void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -156,7 +156,7 @@ define void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)*
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -180,7 +180,7 @@ define void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -202,7 +202,7 @@ define void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)*
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -231,7 +231,7 @@ define void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)*
; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -261,7 +261,7 @@ define void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspa
; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -291,7 +291,7 @@ define void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half add
; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -323,7 +323,7 @@ define void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half ad
; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -355,7 +355,7 @@ define void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, hal
; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -388,7 +388,7 @@ define void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half add
; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -419,7 +419,7 @@ define void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half ad
; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -447,7 +447,7 @@ define void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in
; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
diff --git a/test/CodeGen/AMDGPU/fmuladd.f32.ll b/test/CodeGen/AMDGPU/fmuladd.f32.ll
index e4b1053ff25c..fb605dd2e4bd 100644
--- a/test/CodeGen/AMDGPU/fmuladd.f32.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.f32.ll
@@ -25,7 +25,7 @@ declare float @llvm.fabs.f32(float) #1
; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+define amdgpu_kernel void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
%r0 = load float, float addrspace(1)* %in1
%r1 = load float, float addrspace(1)* %in2
@@ -45,7 +45,7 @@ define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
; GCN-DENORM-STRICT: v_mul_f32_e32
; GCN-DENORM-STRICT: v_add_f32_e32
-define void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
%r0 = load volatile float, float addrspace(1)* %in1
%r1 = load volatile float, float addrspace(1)* %in2
@@ -71,7 +71,7 @@ define void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -100,7 +100,7 @@ define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -132,7 +132,7 @@ define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_a_a_b_f32(float addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -167,7 +167,7 @@ define void @fadd_a_a_b_f32(float addrspace(1)* %out,
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_b_a_a_f32(float addrspace(1)* %out,
+define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -196,7 +196,7 @@ define void @fadd_b_a_a_f32(float addrspace(1)* %out,
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -225,7 +225,7 @@ define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -256,7 +256,7 @@ define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspa
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -286,7 +286,7 @@ define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -318,7 +318,7 @@ define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1
; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -353,7 +353,7 @@ define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrs
; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -387,7 +387,7 @@ define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float a
; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -422,7 +422,7 @@ define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float
; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -460,7 +460,7 @@ define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, fl
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -496,7 +496,7 @@ define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float a
; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -532,7 +532,7 @@ define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -563,7 +563,7 @@ define void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %
; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
diff --git a/test/CodeGen/AMDGPU/fmuladd.f64.ll b/test/CodeGen/AMDGPU/fmuladd.f64.ll
index f5e64b3c5941..86e91e04b0fc 100644
--- a/test/CodeGen/AMDGPU/fmuladd.f64.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.f64.ll
@@ -7,7 +7,7 @@
; GCN-LABEL: {{^}}fmuladd_f64:
; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2, double addrspace(1)* %in3) #0 {
%r0 = load double, double addrspace(1)* %in1
%r1 = load double, double addrspace(1)* %in2
@@ -22,7 +22,7 @@ define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
; GCN-STRICT: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
; GCN-STRICT: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fmul_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fmul_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2, double addrspace(1)* %in3) #0 {
%r0 = load double, double addrspace(1)* %in1
%r1 = load double, double addrspace(1)* %in2
@@ -44,7 +44,7 @@ define void @fmul_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
; SI: buffer_store_dwordx2 [[RESULT]]
; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_a_a_b_f64(double addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f64(double addrspace(1)* %out,
double addrspace(1)* %in1,
double addrspace(1)* %in2) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -72,7 +72,7 @@ define void @fadd_a_a_b_f64(double addrspace(1)* %out,
; SI: buffer_store_dwordx2 [[RESULT]]
; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_b_a_a_f64(double addrspace(1)* %out,
+define amdgpu_kernel void @fadd_b_a_a_f64(double addrspace(1)* %out,
double addrspace(1)* %in1,
double addrspace(1)* %in2) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -94,7 +94,7 @@ define void @fadd_b_a_a_f64(double addrspace(1)* %out,
; GCN-STRICT: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}}
; GCN-CONTRACT: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}}
-define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext
@@ -117,7 +117,7 @@ define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double add
; GCN-STRICT: v_add_f64
; GCN-CONTRACT: v_fma_f64
-define void @fadd_a_a_b_f64_fast_add0(double addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f64_fast_add0(double addrspace(1)* %out,
double addrspace(1)* %in1,
double addrspace(1)* %in2) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -139,7 +139,7 @@ define void @fadd_a_a_b_f64_fast_add0(double addrspace(1)* %out,
; GCN-STRICT: v_add_f64
; GCN-CONTRACT: v_fma_f64
-define void @fadd_a_a_b_f64_fast_add1(double addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f64_fast_add1(double addrspace(1)* %out,
double addrspace(1)* %in1,
double addrspace(1)* %in2) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -158,7 +158,7 @@ define void @fadd_a_a_b_f64_fast_add1(double addrspace(1)* %out,
; GCN-LABEL: {{^}}fadd_a_a_b_f64_fast:
; GCN: v_fma_f64
-define void @fadd_a_a_b_f64_fast(double addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f64_fast(double addrspace(1)* %out,
double addrspace(1)* %in1,
double addrspace(1)* %in2) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/fmuladd.v2f16.ll b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
new file mode 100644
index 000000000000..bdd3c04fd318
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
@@ -0,0 +1,107 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
+
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
+
+; GCN-LABEL: {{^}}fmuladd_v2f16:
+; GFX9-FLUSH: v_pk_mul_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+; GFX9-FLUSH: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; GFX9-DENORM: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+define amdgpu_kernel void @fmuladd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
+ <2 x half> addrspace(1)* %in2, <2 x half> addrspace(1)* %in3) #0 {
+ %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1
+ %r1 = load <2 x half>, <2 x half> addrspace(1)* %in2
+ %r2 = load <2 x half>, <2 x half> addrspace(1)* %in3
+ %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r2)
+ store <2 x half> %r3, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fmuladd_2.0_a_b_v2f16:
+; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
+; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
+; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]
+
+; GFX9-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+
+; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
+; GFX9-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define amdgpu_kernel void @fmuladd_2.0_a_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+ %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1
+ %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+
+ %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0
+ %r2 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1
+
+ %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> <half 2.0, half 2.0>, <2 x half> %r1, <2 x half> %r2)
+ store <2 x half> %r3, <2 x half> addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fmuladd_a_2.0_b_v2f16:
+; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
+; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
+; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]
+
+; GFX9-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+
+; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
+; GFX9-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define amdgpu_kernel void @fmuladd_a_2.0_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+ %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1
+ %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+
+ %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0
+ %r2 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1
+
+ %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %r1, <2 x half> <half 2.0, half 2.0>, <2 x half> %r2)
+ store <2 x half> %r3, <2 x half> addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fadd_a_a_b_v2f16:
+; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
+; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
+; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]
+
+; GFX9-DENORM-STRICT: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
+; GFX9-DENORM-STRICT: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]
+
+; GFX9-DENORM-CONTRACT: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define amdgpu_kernel void @fadd_a_a_b_v2f16(<2 x half> addrspace(1)* %out,
+ <2 x half> addrspace(1)* %in1,
+ <2 x half> addrspace(1)* %in2) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+ %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1
+ %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+
+ %r0 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0
+ %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1
+
+ %add.0 = fadd <2 x half> %r0, %r0
+ %add.1 = fadd <2 x half> %add.0, %r1
+ store <2 x half> %add.1, <2 x half> addrspace(1)* %gep.out
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fnearbyint.ll b/test/CodeGen/AMDGPU/fnearbyint.ll
index 5423fadf81e2..4ff3bbbcbc3e 100644
--- a/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -13,41 +13,41 @@ declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #0
declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0
-define void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 {
+define amdgpu_kernel void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 {
entry:
%0 = call float @llvm.nearbyint.f32(float %in)
store float %0, float addrspace(1)* %out
ret void
}
-define void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+define amdgpu_kernel void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
entry:
%0 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %in)
store <2 x float> %0, <2 x float> addrspace(1)* %out
ret void
}
-define void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
+define amdgpu_kernel void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
entry:
%0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in)
store <4 x float> %0, <4 x float> addrspace(1)* %out
ret void
}
-define void @nearbyint_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @nearbyint_f64(double addrspace(1)* %out, double %in) {
entry:
%0 = call double @llvm.nearbyint.f64(double %in)
store double %0, double addrspace(1)* %out
ret void
}
-define void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
entry:
%0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in)
store <2 x double> %0, <2 x double> addrspace(1)* %out
ret void
}
-define void @nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
entry:
%0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %in)
store <4 x double> %0, <4 x double> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fneg-combines.ll b/test/CodeGen/AMDGPU/fneg-combines.ll
index 3f9928c2b623..1c0e9a2f13ce 100644
--- a/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -enable-unsafe-fp-math -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -start-after=sink -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=tahiti -start-after=sink -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=SI -check-prefix=FUNC %s
; --------------------------------------------------------------------------------
; fadd tests
@@ -14,7 +14,7 @@
; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
-define void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -35,7 +35,7 @@ define void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr
; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -53,12 +53,16 @@ define void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrsp
; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
-; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
-; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
+
+; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
+; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
+
+; GCN-NSZ: v_sub_f32_e64 [[NEG_ADD:v[0-9]+]], -[[A]], [[B]]
+; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_ADD]]
+; GCN: buffer_store_dword [[NEG_ADD]]
; GCN-NEXT: buffer_store_dword [[MUL]]
-define void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -83,7 +87,7 @@ define void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrsp
; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -107,7 +111,7 @@ define void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)*
; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -131,7 +135,7 @@ define void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)*
; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -160,7 +164,7 @@ define void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(
; GCN-NSZ-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]]
-define void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -188,7 +192,7 @@ define void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float add
; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
; GCN-NSZ-NEXT: buffer_store_dword [[MUL]]
-define void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
+define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -214,7 +218,7 @@ define void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float add
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: buffer_store_dword [[RESULT]]
-define void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -235,7 +239,7 @@ define void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
; GCN: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -253,12 +257,11 @@ define void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrsp
; GCN-LABEL: {{^}}v_fneg_mul_multi_use_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
-; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]]
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
-; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
-; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], [[A]], -[[B]]
+; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]
+; GCN-NEXT: buffer_store_dword [[MUL0]]
+; GCN-NEXT: buffer_store_dword [[MUL1]]
+define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -279,7 +282,7 @@ define void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrsp
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -299,7 +302,7 @@ define void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)*
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -319,7 +322,7 @@ define void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)*
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -342,7 +345,7 @@ define void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(
; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
; GCN: buffer_store_dword [[NEG_A]]
-define void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -365,7 +368,7 @@ define void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float add
; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
+define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -383,6 +386,300 @@ define void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float add
}
; --------------------------------------------------------------------------------
+; fminnum tests
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}v_fneg_minnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %min = call float @llvm.minnum.f32(float %a, float %b)
+ %fneg = fsub float -0.000000e+00, %min
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_self_minnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_self_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %min = call float @llvm.minnum.f32(float %a, float %a)
+ %min.fneg = fsub float -0.0, %min
+ store float %min.fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %min = call float @llvm.minnum.f32(float 4.0, float %a)
+ %fneg = fsub float -0.000000e+00, %min
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %min = call float @llvm.minnum.f32(float -4.0, float %a)
+ %fneg = fsub float -0.000000e+00, %min
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_0_minnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %min = call float @llvm.minnum.f32(float 0.0, float %a)
+ %fneg = fsub float -0.000000e+00, %min
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %min = call float @llvm.minnum.f32(float -0.0, float %a)
+ %fneg = fsub float -0.000000e+00, %min
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[A]]
+; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %min = call float @llvm.minnum.f32(float 0.0, float %a)
+ %fneg = fsub float -0.000000e+00, %min
+ %mul = fmul float %fneg, %b
+ store float %mul, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_max_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]]
+; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
+; GCN-NEXT: buffer_store_dword [[MAX0]]
+; GCN-NEXT: buffer_store_dword [[MUL1]]
+define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %min = call float @llvm.minnum.f32(float %a, float %b)
+ %fneg = fsub float -0.000000e+00, %min
+ %use1 = fmul float %min, 4.0
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %use1, float addrspace(1)* %out
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; fmaxnum tests
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}v_fneg_maxnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %min = call float @llvm.maxnum.f32(float %a, float %b)
+ %fneg = fsub float -0.000000e+00, %min
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_self_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %min = call float @llvm.maxnum.f32(float %a, float %a)
+ %min.fneg = fsub float -0.0, %min
+ store float %min.fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_posk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %min = call float @llvm.maxnum.f32(float 4.0, float %a)
+ %fneg = fsub float -0.000000e+00, %min
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_negk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %min = call float @llvm.maxnum.f32(float -4.0, float %a)
+ %fneg = fsub float -0.000000e+00, %min
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_0_maxnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %max = call float @llvm.maxnum.f32(float 0.0, float %a)
+ %fneg = fsub float -0.000000e+00, %max
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %max = call float @llvm.maxnum.f32(float -0.0, float %a)
+ %fneg = fsub float -0.000000e+00, %max
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
+; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %max = call float @llvm.maxnum.f32(float 0.0, float %a)
+ %fneg = fsub float -0.000000e+00, %max
+ %mul = fmul float %fneg, %b
+ store float %mul, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_min_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]]
+; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
+; GCN-NEXT: buffer_store_dword [[MAX0]]
+; GCN-NEXT: buffer_store_dword [[MUL1]]
+define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %min = call float @llvm.maxnum.f32(float %a, float %b)
+ %fneg = fsub float -0.000000e+00, %min
+ %use1 = fmul float %min, 4.0
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %use1, float addrspace(1)* %out
+ ret void
+}
+
+; --------------------------------------------------------------------------------
; fma tests
; --------------------------------------------------------------------------------
@@ -396,7 +693,7 @@ define void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float add
; GCN-NSZ: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
-define void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -420,7 +717,7 @@ define void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr
; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
; GCN-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -441,12 +738,17 @@ define void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrsp
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
-; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]]
+
+; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
+; GCN-SAFE: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
+; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]]
+
+; GCN-NSZ: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
+; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_FMA]]
+
; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
; GCN-NEXT: buffer_store_dword [[MUL]]
-define void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -474,7 +776,7 @@ define void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrsp
; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -501,7 +803,7 @@ define void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1
; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -528,7 +830,7 @@ define void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1
; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -556,7 +858,7 @@ define void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspac
; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -584,7 +886,7 @@ define void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspac
; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -614,7 +916,7 @@ define void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1
; GCN-NSZ-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]]
-define void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -644,7 +946,7 @@ define void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float a
; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_FMA]]
; GCN-NSZ-NEXT: buffer_store_dword [[MUL]]
-define void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 {
+define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -677,7 +979,7 @@ define void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float a
; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
-define void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -697,12 +999,17 @@ define void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.pt
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]]
-; GCN-DAG: v_xor_b32_e32 [[NEG_C:v[0-9]+]], 0x80000000, [[C]]
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]
-; GCN-NEXT: buffer_store_dword [[NEG_C]]
+
+; GCN-SAFE: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]]
+; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]
+
+; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]]
+
+; GCN: buffer_store_dword [[NEG_MAD]]
; GCN-NEXT: buffer_store_dword [[MUL]]
-define void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -728,7 +1035,7 @@ define void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addr
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f64_f32_e64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]]
; GCN: buffer_store_dwordx2 [[RESULT]]
-define void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -744,7 +1051,7 @@ define void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrsp
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
; GCN: buffer_store_dwordx2 [[RESULT]]
-define void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -763,7 +1070,7 @@ define void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float a
; GCN-DAG: v_xor_b32_e32 [[FNEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: buffer_store_dwordx2 [[RESULT]]
; GCN: buffer_store_dword [[FNEG_A]]
-define void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -783,7 +1090,7 @@ define void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %ou
; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
; GCN: buffer_store_dwordx2 v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}
-define void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -803,7 +1110,7 @@ define void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %ou
; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}, 4.0
; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
; GCN: buffer_store_dwordx2 [[MUL]]
-define void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -819,7 +1126,7 @@ define void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspac
; FIXME: Source modifiers not folded for f16->f32
; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
-define void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
@@ -833,7 +1140,7 @@ define void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out
}
; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32:
-define void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
@@ -855,7 +1162,7 @@ define void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_cvt_f32_f64_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -871,7 +1178,7 @@ define void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspa
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -888,10 +1195,9 @@ define void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double ad
; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], v{{\[}}[[A_LO]]:[[A_HI]]{{\]}}
; GCN-DAG: v_xor_b32_e32 v[[NEG_A_HI:[0-9]+]], 0x80000000, v[[A_HI]]
-; GCN-DAG: v_mov_b32_e32 v[[NEG_A_LO:[0-9]+]], v[[A_LO]]
; GCN: buffer_store_dword [[RESULT]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[NEG_A_LO]]:[[NEG_A_HI]]{{\]}}
-define void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+; GCN: buffer_store_dwordx2 v{{\[}}[[A_LO]]:[[NEG_A_HI]]{{\]}}
+define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -911,7 +1217,7 @@ define void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out,
; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s{{\[}}
; GCN: buffer_store_dword [[RESULT]]
; GCN: buffer_store_dwordx2 [[USE1]]
-define void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -930,7 +1236,7 @@ define void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out,
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: buffer_store_short [[RESULT]]
-define void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -946,7 +1252,7 @@ define void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: buffer_store_short [[RESULT]]
-define void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -965,7 +1271,7 @@ define void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addr
; GCN-DAG: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[CVT]]
; GCN: buffer_store_dword [[NEG]]
; GCN: buffer_store_dword [[CVT]]
-define void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -984,7 +1290,7 @@ define void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out,
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: buffer_store_short [[RESULT]]
; GCN: buffer_store_dword [[NEG_A]]
-define void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1004,7 +1310,7 @@ define void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out,
; GCN-DAG: v_mul_f32_e64 [[USE1:v[0-9]+]], -[[A]], s
; GCN: buffer_store_short [[RESULT]]
; GCN: buffer_store_dword [[USE1]]
-define void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1027,7 +1333,7 @@ define void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out,
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rcp_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1043,7 +1349,7 @@ define void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1062,7 +1368,7 @@ define void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: buffer_store_dword [[RESULT]]
; GCN: buffer_store_dword [[NEG_A]]
-define void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1082,7 +1388,7 @@ define void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrs
; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN: buffer_store_dword [[RESULT]]
; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
+define amdgpu_kernel void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1105,7 +1411,7 @@ define void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrs
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rcp_legacy_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1126,7 +1432,7 @@ define void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)*
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: buffer_store_dword [[RESULT]]
-define void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1147,7 +1453,7 @@ define void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)*
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
; GCN: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1170,7 +1476,7 @@ define void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out
; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1191,7 +1497,7 @@ define void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1211,7 +1517,7 @@ define void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrsp
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1231,7 +1537,7 @@ define void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrsp
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1254,7 +1560,7 @@ define void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float add
; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
; GCN: buffer_store_dword [[NEG_A]]
-define void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1277,7 +1583,7 @@ define void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, fl
; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1300,12 +1606,11 @@ define void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, fl
; GCN-LABEL: {{^}}v_fneg_sin_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983
-; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[K]], -[[A]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0xbe22f983, [[A]]
; GCN: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[MUL]]
; GCN: v_sin_f32_e32 [[RESULT:v[0-9]+]], [[FRACT]]
; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1321,27 +1626,509 @@ define void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_sin_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%sin = call float @llvm.amdgcn.sin.f32(float %a)
- %fneg = fsub float -0.000000e+00, %sin
+ %fneg = fsub float -0.0, %sin
store float %fneg, float addrspace(1)* %out.gep
ret void
}
+; --------------------------------------------------------------------------------
+; ftrunc tests
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}v_fneg_trunc_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_trunc_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_trunc_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %trunc = call float @llvm.trunc.f32(float %a)
+ %fneg = fsub float -0.0, %trunc
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; fround tests
+; --------------------------------------------------------------------------------
+
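+; llvm.round.f32 has no single GCN instruction; the lowering is roughly
+; (an illustrative sketch, not literal checked output):
+;   t   = trunc(x)                           ; v_trunc_f32
+;   d   = x - t                              ; v_subrev_f32
+;   inc = |d| >= 0.5 ? copysign(1.0, x) : 0  ; v_cndmask_b32
+;   round(x) = t + inc
+; Under nsz the fneg folds into the final add as a subtract; otherwise it
+; remains a separate sign-bit xor, matching the two prefixes below.
+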
+; GCN-LABEL: {{^}}v_fneg_round_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_trunc_f32_e32
+; GCN: v_subrev_f32_e32
+; GCN: v_cndmask_b32
+
+; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-SAFE: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[ADD]]
+
+; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %round = call float @llvm.round.f32(float %a)
+ %fneg = fsub float -0.0, %round
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; rint tests
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}v_fneg_rint_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_rint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %rint = call float @llvm.rint.f32(float %a)
+ %fneg = fsub float -0.0, %rint
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; nearbyint tests
+; --------------------------------------------------------------------------------
+
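+; llvm.nearbyint selects to the same v_rndne_f32 (round-to-nearest-even)
+; instruction as llvm.rint above, so the fneg folds as a source modifier in
+; exactly the same way.
+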
+; GCN-LABEL: {{^}}v_fneg_nearbyint_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %nearbyint = call float @llvm.nearbyint.f32(float %a)
+ %fneg = fsub float -0.0, %nearbyint
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; vintrp tests
+; --------------------------------------------------------------------------------
+
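+; The VINTRP encoding has no source modifiers, so the fneg cannot be folded
+; into the interp instructions; instead it folds backwards into the mul that
+; feeds both interp uses.
+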
+; GCN-LABEL: {{^}}v_fneg_interp_p1_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
+; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]]
+; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]]
+define amdgpu_kernel void @v_fneg_interp_p1_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %mul = fmul float %a, %b
+ %fneg = fsub float -0.0, %mul
+ %intrp0 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 0, i32 0, i32 0)
+ %intrp1 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 1, i32 0, i32 0)
+ store volatile float %intrp0, float addrspace(1)* %out.gep
+ store volatile float %intrp1, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_interp_p2_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
+; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]]
+; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]]
+define amdgpu_kernel void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %mul = fmul float %a, %b
+ %fneg = fsub float -0.0, %mul
+ %intrp0 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 0, i32 0, i32 0)
+ %intrp1 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 1, i32 0, i32 0)
+ store volatile float %intrp0, float addrspace(1)* %out.gep
+ store volatile float %intrp1, float addrspace(1)* %out.gep
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; CopyToReg tests
+; --------------------------------------------------------------------------------
+
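+; The fneg result is only used across a block boundary (a CopyToReg), where
+; it cannot be absorbed as a source modifier, so it is materialized once with
+; a sign-bit xor (0x80000000) in the block that needs it while the original
+; mul stays available for the store in the other block.
+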
+; GCN-LABEL: {{^}}v_fneg_copytoreg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[B]], [[A]]
+; GCN: s_cbranch_scc1
+
+; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]]
+; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[XOR]]
+; GCN: buffer_store_dword [[MUL1]]
+
+; GCN: buffer_store_dword [[MUL0]]
+define amdgpu_kernel void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %mul = fmul float %a, %b
+ %fneg = fsub float -0.0, %mul
+ %cmp0 = icmp eq i32 %d, 0
+ br i1 %cmp0, label %if, label %endif
+
+if:
+ %mul1 = fmul float %fneg, %c
+ store volatile float %mul1, float addrspace(1)* %out.gep
+ br label %endif
+
+endif:
+ store volatile float %mul, float addrspace(1)* %out.gep
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; inlineasm tests
+; --------------------------------------------------------------------------------
+
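+; Inline asm operands cannot carry source modifiers, so the fneg must either
+; fold into the instruction that produces the value or be emitted as an
+; explicit sign-bit xor.
+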
+; Can't fold into use, so should fold into source
+; GCN-LABEL: {{^}}v_fneg_inlineasm_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
+; GCN: ; use [[MUL]]
+; GCN: buffer_store_dword [[MUL]]
+define amdgpu_kernel void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %mul = fmul float %a, %b
+ %fneg = fsub float -0.0, %mul
+ call void asm sideeffect "; use $0", "v"(float %fneg) #0
+ store volatile float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; Can't fold into the use, and the source has another user, so the fneg is
+; materialized with a sign-bit xor instead.
+; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]]
+; GCN: ; use [[NEG]]
+; GCN: buffer_store_dword [[MUL]]
+define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %mul = fmul float %a, %b
+ %fneg = fsub float -0.0, %mul
+ call void asm sideeffect "; use $0", "v"(float %fneg) #0
+ store volatile float %mul, float addrspace(1)* %out.gep
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; code size regression tests
+; --------------------------------------------------------------------------------
+
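+; VOP2 encodings (e.g. v_mul_f32_e32) are 4 bytes and cannot encode source
+; modifiers, so folding an fneg as an operand modifier forces the 8-byte
+; VOP3 form. An illustrative pair (not checked output):
+;   v_mul_f32_e32 v0, v1, v2    ; 4-byte VOP2, no source modifiers
+;   v_mul_f32_e64 v0, -v1, v2   ; 8-byte VOP3, modifier comes with the encoding
+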
+; There are multiple users of the fneg that must use a VOP3
+; instruction, so there is no penalty
+; GCN-LABEL: {{^}}multiuse_fneg_2_vop3_users_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+
+; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], [[C]]
+; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0
+; GCN-NEXT: buffer_store_dword [[FMA0]]
+; GCN-NEXT: buffer_store_dword [[FMA1]]
+define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+
+ %fneg.a = fsub float -0.0, %a
+ %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
+ %fma1 = call float @llvm.fma.f32(float %fneg.a, float %c, float 2.0)
+
+ store volatile float %fma0, float addrspace(1)* %out
+ store volatile float %fma1, float addrspace(1)* %out
+ ret void
+}
+
+; There are multiple users, but both require using a larger encoding
+; for the modifier.
+
+; GCN-LABEL: {{^}}multiuse_fneg_2_vop2_users_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+
+; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], -[[A]], [[B]]
+; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
+; GCN-NEXT: buffer_store_dword [[MUL0]]
+; GCN-NEXT: buffer_store_dword [[MUL1]]
+define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+
+ %fneg.a = fsub float -0.0, %a
+ %mul0 = fmul float %fneg.a, %b
+ %mul1 = fmul float %fneg.a, %c
+
+ store volatile float %mul0, float addrspace(1)* %out
+ store volatile float %mul1, float addrspace(1)* %out
+ ret void
+}
+
+; One user is VOP3 so has no cost to folding the modifier, the other does.
+; GCN-LABEL: {{^}}multiuse_fneg_vop2_vop3_users_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+
+; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], 2.0
+; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
+
+; GCN: buffer_store_dword [[FMA0]]
+; GCN-NEXT: buffer_store_dword [[MUL1]]
+define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+
+ %fneg.a = fsub float -0.0, %a
+ %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float 2.0)
+ %mul1 = fmul float %fneg.a, %c
+
+ store volatile float %fma0, float addrspace(1)* %out
+ store volatile float %mul1, float addrspace(1)* %out
+ ret void
+}
+
+; The use of the fneg requires a code size increase, but folding into
+; the source does not
+
+; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
+
+; GCN-SAFE: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], [[B]], 2.0
+; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[FMA0]], [[C]]
+; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL2:v[0-9]+]], -[[FMA0]], [[D]]
+
+; GCN-NSZ: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0
+; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[FMA0]]
+; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[D]], [[FMA0]]
+
+; GCN: buffer_store_dword [[MUL1]]
+; GCN-NEXT: buffer_store_dword [[MUL2]]
+define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %d = load volatile float, float addrspace(1)* %d.gep
+
+ %fma0 = call float @llvm.fma.f32(float %a, float %b, float 2.0)
+ %fneg.fma0 = fsub float -0.0, %fma0
+ %mul1 = fmul float %fneg.fma0, %c
+ %mul2 = fmul float %fneg.fma0, %d
+
+ store volatile float %mul1, float addrspace(1)* %out
+ store volatile float %mul2, float addrspace(1)* %out
+ ret void
+}
+
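+; For f64 the multiply is VOP3-only (v_mul_f64), so the source modifier is
+; always free; the fneg is applied directly to both mul operands with no
+; nsz-dependent rewrite.
+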
+; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f64:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
+; GCN: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]]
+; GCN: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]]
+
+; GCN: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0
+; GCN-DAG: v_mul_f64 [[MUL0:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[C]]
+; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]]
+
+; GCN: buffer_store_dwordx2 [[MUL0]]
+; GCN: buffer_store_dwordx2 [[MUL1]]
+define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds double, double addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds double, double addrspace(1)* %c.ptr, i64 %tid.ext
+ %d.gep = getelementptr inbounds double, double addrspace(1)* %d.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile double, double addrspace(1)* %a.gep
+ %b = load volatile double, double addrspace(1)* %b.gep
+ %c = load volatile double, double addrspace(1)* %c.gep
+ %d = load volatile double, double addrspace(1)* %d.gep
+
+ %fma0 = call double @llvm.fma.f64(double %a, double %b, double 2.0)
+ %fneg.fma0 = fsub double -0.0, %fma0
+ %mul1 = fmul double %fneg.fma0, %c
+ %mul2 = fmul double %fneg.fma0, %d
+
+ store volatile double %mul1, double addrspace(1)* %out
+ store volatile double %mul2, double addrspace(1)* %out
+ ret void
+}
+
+; %trunc.a has one fneg use, but it requires a code size increase and
+; the fneg can instead be folded for free into the fma.
+
+; GCN-LABEL: {{^}}one_use_cost_to_fold_into_src_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
+; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
+; GCN: buffer_store_dword [[FMA0]]
+define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %d = load volatile float, float addrspace(1)* %d.gep
+
+ %trunc.a = call float @llvm.trunc.f32(float %a)
+ %trunc.fneg.a = fsub float -0.0, %trunc.a
+ %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
+ store volatile float %fma0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}multi_use_cost_to_fold_into_src:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
+; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
+; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
+; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[D]], [[TRUNC_A]]
+; GCN: buffer_store_dword [[FMA0]]
+; GCN: buffer_store_dword [[MUL1]]
+define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %d = load volatile float, float addrspace(1)* %d.gep
+
+ %trunc.a = call float @llvm.trunc.f32(float %a)
+ %trunc.fneg.a = fsub float -0.0, %trunc.a
+ %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
+ %mul1 = fmul float %trunc.a, %d
+ store volatile float %fma0, float addrspace(1)* %out
+ store volatile float %mul1, float addrspace(1)* %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fma.f32(float, float, float) #1
declare float @llvm.fmuladd.f32(float, float, float) #1
declare float @llvm.sin.f32(float) #1
+declare float @llvm.trunc.f32(float) #1
+declare float @llvm.round.f32(float) #1
+declare float @llvm.rint.f32(float) #1
+declare float @llvm.nearbyint.f32(float) #1
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+
+declare double @llvm.fma.f64(double, double, double) #1
declare float @llvm.amdgcn.sin.f32(float) #1
declare float @llvm.amdgcn.rcp.f32(float) #1
declare float @llvm.amdgcn.rcp.legacy(float) #1
declare float @llvm.amdgcn.fmul.legacy(float, float) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index d7d21311c1b9..555764c15519 100644
--- a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -1,33 +1,35 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=CIVI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GFX89 -check-prefix=GCN -check-prefix=CIVI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX89 -check-prefix=GFX9 -check-prefix=GCN %s
; GCN-LABEL: {{^}}fneg_fabs_fadd_f16:
; CI: v_cvt_f32_f16_e32
-; CI: v_cvt_f32_f16_e32
-; CI: v_sub_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |v{{[0-9]+}}|
+; CI: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |v{{[0-9]+}}|
+; CI: v_subrev_f32_e32 v{{[0-9]+}}, [[CVT_ABS_X]], v{{[0-9]+}}
-; VI-NOT: and
-; VI: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|
-define void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, half %y) {
+; GFX89-NOT: _and
+; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|
+define amdgpu_kernel void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, half %y) {
%fabs = call half @llvm.fabs.f16(half %x)
- %fsub = fsub half -0.000000e+00, %fabs
+ %fsub = fsub half -0.0, %fabs
%fadd = fadd half %y, %fsub
store half %fadd, half addrspace(1)* %out, align 2
ret void
}
; GCN-LABEL: {{^}}fneg_fabs_fmul_f16:
-; CI: v_cvt_f32_f16_e32
-; CI: v_cvt_f32_f16_e32
-; CI: v_mul_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{v[0-9]+}}|
+; CI-DAG: v_cvt_f32_f16_e32
+; CI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG_ABS_X:v[0-9]+]], -|{{v[0-9]+}}|
+; CI: v_mul_f32_e32 {{v[0-9]+}}, [[CVT_NEG_ABS_X]], {{v[0-9]+}}
; CI: v_cvt_f16_f32_e32
-; VI-NOT: and
-; VI: v_mul_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{v[0-9]+}}|
-; VI-NOT: and
-define void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) {
+; GFX89-NOT: _and
+; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], {{v[0-9]+}}, -|{{v[0-9]+}}|
+; GFX89-NOT: [[MUL]]
+; GFX89: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
+define amdgpu_kernel void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) {
%fabs = call half @llvm.fabs.f16(half %x)
- %fsub = fsub half -0.000000e+00, %fabs
+ %fsub = fsub half -0.0, %fabs
%fmul = fmul half %y, %fsub
store half %fmul, half addrspace(1)* %out, align 2
ret void
@@ -39,75 +41,113 @@ define void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) {
; GCN-LABEL: {{^}}fneg_fabs_free_f16:
; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
-define void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
+define amdgpu_kernel void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
%bc = bitcast i16 %in to half
%fabs = call half @llvm.fabs.f16(half %bc)
- %fsub = fsub half -0.000000e+00, %fabs
+ %fsub = fsub half -0.0, %fabs
store half %fsub, half addrspace(1)* %out
ret void
}
-; FIXME: Should use or
; GCN-LABEL: {{^}}fneg_fabs_f16:
-; CI: v_cvt_f32_f16_e32 v{{[0-9]+}},
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-
-; VI: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
-define void @fneg_fabs_f16(half addrspace(1)* %out, half %in) {
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
+define amdgpu_kernel void @fneg_fabs_f16(half addrspace(1)* %out, half %in) {
%fabs = call half @llvm.fabs.f16(half %in)
- %fsub = fsub half -0.000000e+00, %fabs
+ %fsub = fsub half -0.0, %fabs
store half %fsub, half addrspace(1)* %out, align 2
ret void
}
; GCN-LABEL: {{^}}v_fneg_fabs_f16:
-; CI: v_cvt_f32_f16_e32 v{{[0-9]+}},
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-
-; VI: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
-define void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
+define amdgpu_kernel void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
%val = load half, half addrspace(1)* %in, align 2
%fabs = call half @llvm.fabs.f16(half %val)
- %fsub = fsub half -0.000000e+00, %fabs
+ %fsub = fsub half -0.0, %fabs
store half %fsub, half addrspace(1)* %out, align 2
ret void
}
; FIXME: single bit op
-; GCN-LABEL: {{^}}fneg_fabs_v2f16:
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-
-; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: flat_store_dword
-define void @fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
+; GCN-LABEL: {{^}}s_fneg_fabs_v2f16:
+; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
+; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; CIVI: flat_store_dword
+
+; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}
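+; fneg(fabs(x)) only sets the sign bits, so GFX9 folds the whole thing
+; into one scalar OR with the packed 0x80008000 mask, while CI/VI OR
+; each half with 0x8000 separately.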
+define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
%fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
- %fsub = fsub <2 x half> <half -0.000000e+00, half -0.000000e+00>, %fabs
- store <2 x half> %fsub, <2 x half> addrspace(1)* %out
+ %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs
+ store <2 x half> %fneg.fabs, <2 x half> addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}fneg_fabs_v4f16:
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-
-; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: flat_store_dwordx2
-define void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
+; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
+; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+
+; GFX9: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
+; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}
+; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}
+
+; GCN: flat_store_dwordx2
+define amdgpu_kernel void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
%fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
- %fsub = fsub <4 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %fabs
+ %fsub = fsub <4 x half> <half -0.0, half -0.0, half -0.0, half -0.0>, %fabs
store <4 x half> %fsub, <4 x half> addrspace(1)* %out
ret void
}
-declare half @llvm.fabs.f16(half) readnone
-declare <2 x half> @llvm.fabs.v2f16(<2 x half>) readnone
-declare <4 x half> @llvm.fabs.v4f16(<4 x half>) readnone
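+; When the only user of fneg(fabs(x)) is a multiply, the negation should
+; be absorbed by that user rather than materialized. VI applies -| |
+; modifiers on the two 16-bit multiplies; GFX9 keeps the fabs as a scalar
+; mask and negates through the packed multiply's neg_lo/neg_hi modifiers.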
+; GCN-LABEL: {{^}}fold_user_fneg_fabs_v2f16:
+; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
+; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
+; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+
+; VI: v_mul_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|, 4.0
+; VI: v_mul_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|, 4.0
+
+; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
+; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[ABS]], 4.0 neg_lo:[1,0] neg_hi:[1,0]
+define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
+ %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
+ %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs
+ %mul = fmul <2 x half> %fneg.fabs, <half 4.0, half 4.0>
+ store <2 x half> %mul, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_fneg_multi_use_fabs_v2f16:
+; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
+; GFX9: v_mov_b32_e32 [[VABS:v[0-9]+]], [[ABS]]
+; GFX9: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VABS]]
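+; The fneg result is stored directly and has no foldable user, so it
+; must be materialized as an XOR of the packed sign bits on top of the
+; fabs mask.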
+define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) {
+ %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
+ %fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs
+ store <2 x half> %fabs, <2 x half> addrspace(1)* %out0
+ store <2 x half> %fneg, <2 x half> addrspace(1)* %out1
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_fneg_multi_use_fabs_foldable_neg_v2f16:
+; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
+; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[ABS]], 4.0 neg_lo:[1,0] neg_hi:[1,0]
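+; The multiply is the fneg's only user, so the negation still folds into
+; the pk_mul source modifiers even though |x| itself is also stored.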
+define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) {
+ %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
+ %fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs
+ %mul = fmul <2 x half> %fneg, <half 4.0, half 4.0>
+ store <2 x half> %fabs, <2 x half> addrspace(1)* %out0
+ store <2 x half> %mul, <2 x half> addrspace(1)* %out1
+ ret void
+}
+
+declare half @llvm.fabs.f16(half) #1
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
+declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
index d16e83fd4d5b..85f544032171 100644
--- a/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
@@ -6,7 +6,7 @@
; GCN-LABEL: {{^}}fneg_fabs_fadd_f64:
; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|, {{s\[[0-9]+:[0-9]+\]}}
-define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) {
+define amdgpu_kernel void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) {
%fabs = call double @llvm.fabs.f64(double %x)
%fsub = fsub double -0.000000e+00, %fabs
%fadd = fadd double %y, %fsub
@@ -14,7 +14,7 @@ define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y)
ret void
}
-define void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %xptr, double addrspace(1)* %yptr) {
+define amdgpu_kernel void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %xptr, double addrspace(1)* %yptr) {
%x = load double, double addrspace(1)* %xptr, align 8
  %y = load double, double addrspace(1)* %yptr, align 8
%fabs = call double @llvm.fabs.f64(double %x)
@@ -26,7 +26,7 @@ define void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)
; GCN-LABEL: {{^}}fneg_fabs_fmul_f64:
; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -|{{v\[[0-9]+:[0-9]+\]}}|, {{s\[[0-9]+:[0-9]+\]}}
-define void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) {
+define amdgpu_kernel void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) {
%fabs = call double @llvm.fabs.f64(double %x)
%fsub = fsub double -0.000000e+00, %fabs
%fmul = fmul double %y, %fsub
@@ -35,7 +35,7 @@ define void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y)
}
; GCN-LABEL: {{^}}fneg_fabs_free_f64:
-define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) {
%bc = bitcast i64 %in to double
%fabs = call double @llvm.fabs.f64(double %bc)
%fsub = fsub double -0.000000e+00, %fabs
@@ -46,7 +46,7 @@ define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) {
; GCN-LABEL: {{^}}fneg_fabs_fn_free_f64:
; GCN: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}}
; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
%bc = bitcast i64 %in to double
%fabs = call double @fabs(double %bc)
%fsub = fsub double -0.000000e+00, %fabs
@@ -62,7 +62,7 @@ define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
; GCN-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
; GCN-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}}
-define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
%fabs = call double @llvm.fabs.f64(double %in)
%fsub = fsub double -0.000000e+00, %fabs
store double %fsub, double addrspace(1)* %out, align 8
@@ -74,7 +74,7 @@ define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
; GCN-NOT: 0x80000000
; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
%fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
%fsub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %fabs
store <2 x double> %fsub, <2 x double> addrspace(1)* %out
@@ -88,7 +88,7 @@ define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in)
; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-define void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
%fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
%fsub = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %fabs
store <4 x double> %fsub, <4 x double> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.ll b/test/CodeGen/AMDGPU/fneg-fabs.ll
index 9ee1171306c7..a0cf37b159db 100644
--- a/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -5,7 +5,7 @@
; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
; SI-NOT: and
; SI: v_subrev_f32_e64 {{v[0-9]+}}, |{{v[0-9]+}}|, {{s[0-9]+}}
-define void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) {
+define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) {
%fabs = call float @llvm.fabs.f32(float %x)
%fsub = fsub float -0.000000e+00, %fabs
%fadd = fadd float %y, %fsub
@@ -17,7 +17,7 @@ define void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) {
; SI-NOT: and
; SI: v_mul_f32_e64 {{v[0-9]+}}, -|{{v[0-9]+}}|, {{s[0-9]+}}
; SI-NOT: and
-define void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
+define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
%fabs = call float @llvm.fabs.f32(float %x)
%fsub = fsub float -0.000000e+00, %fabs
%fmul = fmul float %y, %fsub
@@ -35,7 +35,7 @@ define void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
; R600: -PV
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
%bc = bitcast i32 %in to float
%fabs = call float @llvm.fabs.f32(float %bc)
%fsub = fsub float -0.000000e+00, %fabs
@@ -49,7 +49,7 @@ define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
; R600: -PV
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
%bc = bitcast i32 %in to float
%fabs = call float @fabs(float %bc)
%fsub = fsub float -0.000000e+00, %fabs
@@ -59,7 +59,7 @@ define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
; FUNC-LABEL: {{^}}fneg_fabs_f32:
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fneg_fabs_f32(float addrspace(1)* %out, float %in) {
%fabs = call float @llvm.fabs.f32(float %in)
%fsub = fsub float -0.000000e+00, %fabs
store float %fsub, float addrspace(1)* %out, align 4
@@ -68,7 +68,7 @@ define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) {
; FUNC-LABEL: {{^}}v_fneg_fabs_f32:
; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
-define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%val = load float, float addrspace(1)* %in, align 4
%fabs = call float @llvm.fabs.f32(float %val)
%fsub = fsub float -0.000000e+00, %fabs
@@ -86,7 +86,7 @@ define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in)
; SI: s_brev_b32 [[SIGNBITK:s[0-9]+]], 1{{$}}
; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
-define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
%fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
store <2 x float> %fsub, <2 x float> addrspace(1)* %out
@@ -99,7 +99,7 @@ define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
-define void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
%fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
%fsub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %fabs
store <4 x float> %fsub, <4 x float> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fneg.f16.ll b/test/CodeGen/AMDGPU/fneg.f16.ll
index e3dfd9201a24..626a0b50cce8 100644
--- a/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -1,11 +1,11 @@
-; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
; FIXME: Should be able to do scalar op
-; FUNC-LABEL: {{^}}s_fneg_f16:
-
-define void @s_fneg_f16(half addrspace(1)* %out, half %in) {
- %fneg = fsub half -0.000000e+00, %in
+; GCN-LABEL: {{^}}s_fneg_f16:
+define amdgpu_kernel void @s_fneg_f16(half addrspace(1)* %out, half %in) #0 {
+ %fneg = fsub half -0.0, %in
store half %fneg, half addrspace(1)* %out
ret void
}
@@ -13,49 +13,123 @@ define void @s_fneg_f16(half addrspace(1)* %out, half %in) {
; FIXME: Should be able to use bit operations when illegal type as
; well.
-; FUNC-LABEL: {{^}}v_fneg_f16:
+; GCN-LABEL: {{^}}v_fneg_f16:
; GCN: flat_load_ushort [[VAL:v[0-9]+]],
-
-; CI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[VAL]]
-; CI: v_cvt_f16_f32_e64 [[CVT1:v[0-9]+]], -[[CVT0]]
-; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]]
-
-; VI: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[VAL]]
+; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[VAL]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
-define void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
- %val = load half, half addrspace(1)* %in, align 2
- %fneg = fsub half -0.000000e+00, %val
- store half %fneg, half addrspace(1)* %out
+; GFX9: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
+define amdgpu_kernel void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.in = getelementptr inbounds half, half addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr inbounds half, half addrspace(1)* %out, i32 %tid
+ %val = load half, half addrspace(1)* %gep.in, align 2
+ %fneg = fsub half -0.0, %val
+ store half %fneg, half addrspace(1)* %gep.out
ret void
}
-; FUNC-LABEL: {{^}}fneg_free_f16:
+; GCN-LABEL: {{^}}fneg_free_f16:
; GCN: flat_load_ushort [[NEG_VALUE:v[0-9]+]],
; XCI: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}}
; CI: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[NEG_VALUE]]
; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
-define void @fneg_free_f16(half addrspace(1)* %out, i16 %in) {
+define amdgpu_kernel void @fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 {
%bc = bitcast i16 %in to half
%fsub = fsub half -0.0, %bc
store half %fsub, half addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}v_fneg_fold_f16:
+; GCN-LABEL: {{^}}v_fneg_fold_f16:
; GCN: flat_load_ushort [[NEG_VALUE:v[0-9]+]]
-; CI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[CVT0]]
-; CI: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[CVT0]], [[CVT0]]
+; CI-DAG: v_cvt_f32_f16_e32 [[CVT_VAL:v[0-9]+]], [[NEG_VALUE]]
+; CI-DAG: v_cvt_f32_f16_e64 [[NEG_CVT0:v[0-9]+]], -[[NEG_VALUE]]
+; CI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_VAL]], [[NEG_CVT0]]
; CI: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], [[MUL]]
; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]]
; VI-NOT: [[NEG_VALUE]]
; VI: v_mul_f16_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]]
-define void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%val = load half, half addrspace(1)* %in
%fsub = fsub half -0.0, %val
%fmul = fmul half %fsub, %val
store half %fmul, half addrspace(1)* %out
ret void
}
+
+; FIXME: Terrible code with VI and even worse with SI/CI
+; GCN-LABEL: {{^}}s_fneg_v2f16:
+; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
+; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; CI: v_or_b32_e32
+
+; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x8000{{$}}
+; VI-DAG: v_xor_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-DAG: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
+
+; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
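+; Negating <2 x half> is just flipping the two sign bits. GFX9 gets a
+; single XOR against the packed 0x80008000 mask; CI and VI still split
+; the halves, which is what the FIXME above refers to.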
+
+define amdgpu_kernel void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
+ %fneg = fsub <2 x half> <half -0.0, half -0.0>, %in
+ store <2 x half> %fneg, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_v2f16:
+; GCN: flat_load_dword [[VAL:v[0-9]+]]
+; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VAL]]
+define amdgpu_kernel void @v_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+ %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
+ %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
+ store <2 x half> %fneg, <2 x half> addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fneg_free_v2f16:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; CIVI: s_xor_b32 s{{[0-9]+}}, [[VAL]], 0x80008000
+
+; GFX9: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VVAL]]
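+; The i32 input arrives in an SGPR, so CI/VI flip both sign bits with a
+; single scalar XOR; GFX9 currently copies the value to a VGPR first and
+; uses the vector XOR instead.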
+define amdgpu_kernel void @fneg_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
+ %bc = bitcast i32 %in to <2 x half>
+ %fsub = fsub <2 x half> <half -0.0, half -0.0>, %bc
+ store <2 x half> %fsub, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fold_v2f16:
+; GCN: flat_load_dword [[VAL:v[0-9]+]]
+
+; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}
+; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}
+; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_cvt_f16_f32
+; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_cvt_f16_f32
+
+; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
+
+; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,0] neg_hi:[1,0]{{$}}
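+; Only GFX9 has a packed f16 multiply, so only it folds the negation
+; into neg_lo/neg_hi source modifiers; CI promotes each half to f32 and
+; VI splits the vector into two 16-bit multiplies.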
+define amdgpu_kernel void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+ %val = load <2 x half>, <2 x half> addrspace(1)* %in
+ %fsub = fsub <2 x half> <half -0.0, half -0.0>, %val
+ %fmul = fmul <2 x half> %fsub, %val
+ store <2 x half> %fmul, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fneg.f64.ll b/test/CodeGen/AMDGPU/fneg.f64.ll
index b7080f4622a3..9b4b4d6e942a 100644
--- a/test/CodeGen/AMDGPU/fneg.f64.ll
+++ b/test/CodeGen/AMDGPU/fneg.f64.ll
@@ -3,7 +3,7 @@
; FUNC-LABEL: {{^}}fneg_f64:
; GCN: v_xor_b32
-define void @fneg_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fneg_f64(double addrspace(1)* %out, double %in) {
%fneg = fsub double -0.000000e+00, %in
store double %fneg, double addrspace(1)* %out
ret void
@@ -12,7 +12,7 @@ define void @fneg_f64(double addrspace(1)* %out, double %in) {
; FUNC-LABEL: {{^}}fneg_v2f64:
; GCN: v_xor_b32
; GCN: v_xor_b32
-define void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) {
+define amdgpu_kernel void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) {
%fneg = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %in
store <2 x double> %fneg, <2 x double> addrspace(1)* %out
ret void
@@ -28,7 +28,7 @@ define void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double>
; GCN: v_xor_b32
; GCN: v_xor_b32
; GCN: v_xor_b32
-define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) {
+define amdgpu_kernel void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) {
%fneg = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %in
store <4 x double> %fneg, <4 x double> addrspace(1)* %out
ret void
@@ -40,7 +40,7 @@ define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double>
; FUNC-LABEL: {{^}}fneg_free_f64:
; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, -{{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @fneg_free_f64(double addrspace(1)* %out, i64 %in) {
%bc = bitcast i64 %in to double
%fsub = fsub double 0.0, %bc
store double %fsub, double addrspace(1)* %out
@@ -52,7 +52,7 @@ define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) {
; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN-NOT: xor
; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]]
-define void @fneg_fold_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fneg_fold_f64(double addrspace(1)* %out, double %in) {
%fsub = fsub double -0.0, %in
%fmul = fmul double %fsub, %in
store double %fmul, double addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fneg.ll b/test/CodeGen/AMDGPU/fneg.ll
index 007c6dcadd9e..d1eabfb13c9a 100644
--- a/test/CodeGen/AMDGPU/fneg.ll
+++ b/test/CodeGen/AMDGPU/fneg.ll
@@ -6,7 +6,7 @@
; R600: -PV
; GCN: v_xor_b32
-define void @s_fneg_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @s_fneg_f32(float addrspace(1)* %out, float %in) {
%fneg = fsub float -0.000000e+00, %in
store float %fneg, float addrspace(1)* %out
ret void
@@ -18,7 +18,7 @@ define void @s_fneg_f32(float addrspace(1)* %out, float %in) {
; GCN: v_xor_b32
; GCN: v_xor_b32
-define void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
+define amdgpu_kernel void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
%fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
store <2 x float> %fneg, <2 x float> addrspace(1)* %out
ret void
@@ -34,7 +34,7 @@ define void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float>
; GCN: v_xor_b32
; GCN: v_xor_b32
; GCN: v_xor_b32
-define void @s_fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
+define amdgpu_kernel void @s_fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
%fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
store <4 x float> %fneg, <4 x float> addrspace(1)* %out
ret void
@@ -50,7 +50,7 @@ define void @s_fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float>
; R600-NOT: XOR
; R600: -KC0[2].Z
-define void @fsub0_f32(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @fsub0_f32(float addrspace(1)* %out, i32 %in) {
%bc = bitcast i32 %in to float
%fsub = fsub float 0.0, %bc
store float %fsub, float addrspace(1)* %out
@@ -66,7 +66,7 @@ define void @fsub0_f32(float addrspace(1)* %out, i32 %in) {
; R600-NOT: XOR
; R600: -PV.W
-define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @fneg_free_f32(float addrspace(1)* %out, i32 %in) {
%bc = bitcast i32 %in to float
%fsub = fsub float -0.0, %bc
store float %fsub, float addrspace(1)* %out
@@ -78,7 +78,7 @@ define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) {
; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; GCN-NOT: xor
; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]]
-define void @fneg_fold_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fneg_fold_f32(float addrspace(1)* %out, float %in) {
%fsub = fsub float -0.0, %in
%fmul = fmul float %fsub, %in
store float %fmul, float addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fold-cndmask.mir b/test/CodeGen/AMDGPU/fold-cndmask.mir
new file mode 100644
index 000000000000..8dfec9166303
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fold-cndmask.mir
@@ -0,0 +1,34 @@
+# RUN: llc -march=amdgcn -run-pass si-fold-operands -verify-machineinstrs -o - %s | FileCheck %s
+
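+# A V_CNDMASK_B32 whose two data operands are identical (or both known
+# to be the same immediate) produces the same result for either
+# condition, so si-fold-operands should rewrite each select below into a
+# plain V_MOV_B32 of the immediate or a COPY of the register.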
+# CHECK: %1 = V_MOV_B32_e32 0, implicit %exec
+# CHECK: %2 = V_MOV_B32_e32 0, implicit %exec
+# CHECK: %4 = COPY %3
+# CHECK: %5 = V_MOV_B32_e32 0, implicit %exec
+# CHECK: %6 = V_MOV_B32_e32 0, implicit %exec
+# CHECK: %7 = COPY %3
+
+---
+name: fold_cndmask
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: vgpr_32 }
+ - { id: 5, class: vgpr_32 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: vgpr_32 }
+body: |
+ bb.0.entry:
+ %0 = IMPLICIT_DEF
+ %1 = V_CNDMASK_B32_e64 0, 0, %0, implicit %exec
+ %2 = V_CNDMASK_B32_e64 %1, %1, %0, implicit %exec
+ %3 = IMPLICIT_DEF
+ %4 = V_CNDMASK_B32_e64 %3, %3, %0, implicit %exec
+ %5 = COPY %1
+ %6 = V_CNDMASK_B32_e64 %5, 0, %0, implicit %exec
+ %vcc = IMPLICIT_DEF
+ %7 = V_CNDMASK_B32_e32 %3, %3, implicit %exec, implicit %vcc
+
+...
diff --git a/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir b/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
new file mode 100644
index 000000000000..986c6b296c96
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
@@ -0,0 +1,306 @@
+# RUN: llc -march=amdgcn -run-pass peephole-opt -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
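+# V_MADAK_F32/V_MADMK_F32 are VOP2 encodings that carry a 32-bit literal
+# but have no clamp or omod fields, so peephole-opt must not fold the
+# V_MOV_B32 literal (1090519040 = 0x41000000 = 8.0) into a MAC/MAD whose
+# output modifiers are in use. Each function below covers one MAC/MAD
+# with one of the clamp/omod modifiers set.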
+--- |
+ define amdgpu_kernel void @no_fold_imm_madak_mac_clamp_f32() #0 {
+ ret void
+ }
+
+ define amdgpu_kernel void @no_fold_imm_madak_mac_omod_f32() #0 {
+ ret void
+ }
+
+ define amdgpu_kernel void @no_fold_imm_madak_mad_clamp_f32() #0 {
+ ret void
+ }
+
+ define amdgpu_kernel void @no_fold_imm_madak_mad_omod_f32() #0 {
+ ret void
+ }
+
+ attributes #0 = { nounwind }
+
+...
+---
+# GCN-LABEL: name: no_fold_imm_madak_mac_clamp_f32
+# GCN: %23 = V_MOV_B32_e32 1090519040, implicit %exec
+# GCN-NEXT: %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit %exec
+
+name: no_fold_imm_madak_mac_clamp_f32
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_64_xexec }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32 }
+ - { id: 9, class: sreg_32_xm0 }
+ - { id: 10, class: sreg_64 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sreg_32_xm0 }
+ - { id: 13, class: sgpr_64 }
+ - { id: 14, class: sgpr_128 }
+ - { id: 15, class: sreg_32_xm0 }
+ - { id: 16, class: sreg_64 }
+ - { id: 17, class: sgpr_128 }
+ - { id: 18, class: sgpr_128 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: vgpr_32 }
+ - { id: 22, class: vreg_64 }
+ - { id: 23, class: vgpr_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vgpr_32 }
+ - { id: 26, class: vreg_64 }
+ - { id: 27, class: vgpr_32 }
+ - { id: 28, class: vreg_64 }
+ - { id: 29, class: vreg_64 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %27 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %28 = REG_SEQUENCE %3, 1, %27, 2
+ %11 = S_MOV_B32 61440
+ %12 = S_MOV_B32 0
+ %13 = REG_SEQUENCE killed %12, 1, killed %11, 2
+ %14 = REG_SEQUENCE killed %5, 17, %13, 18
+ %15 = S_MOV_B32 2
+ %29 = V_LSHL_B64 killed %28, killed %15, implicit %exec
+ %17 = REG_SEQUENCE killed %6, 17, %13, 18
+ %18 = REG_SEQUENCE killed %4, 17, %13, 18
+ %20 = COPY %29
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, implicit %exec
+ %22 = COPY %29
+ %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, implicit %exec
+ %23 = V_MOV_B32_e32 1090519040, implicit %exec
+ %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit %exec
+ %26 = COPY %29
+ BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+
+...
+---
+# GCN-LABEL: name: no_fold_imm_madak_mac_omod_f32
+# GCN: %23 = V_MOV_B32_e32 1090519040, implicit %exec
+# GCN: %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 0, 2, implicit %exec
+
+name: no_fold_imm_madak_mac_omod_f32
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_64_xexec }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32 }
+ - { id: 9, class: sreg_32_xm0 }
+ - { id: 10, class: sreg_64 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sreg_32_xm0 }
+ - { id: 13, class: sgpr_64 }
+ - { id: 14, class: sgpr_128 }
+ - { id: 15, class: sreg_32_xm0 }
+ - { id: 16, class: sreg_64 }
+ - { id: 17, class: sgpr_128 }
+ - { id: 18, class: sgpr_128 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: vgpr_32 }
+ - { id: 22, class: vreg_64 }
+ - { id: 23, class: vgpr_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vgpr_32 }
+ - { id: 26, class: vreg_64 }
+ - { id: 27, class: vgpr_32 }
+ - { id: 28, class: vreg_64 }
+ - { id: 29, class: vreg_64 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %27 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %28 = REG_SEQUENCE %3, 1, %27, 2
+ %11 = S_MOV_B32 61440
+ %12 = S_MOV_B32 0
+ %13 = REG_SEQUENCE killed %12, 1, killed %11, 2
+ %14 = REG_SEQUENCE killed %5, 17, %13, 18
+ %15 = S_MOV_B32 2
+ %29 = V_LSHL_B64 killed %28, killed %15, implicit %exec
+ %17 = REG_SEQUENCE killed %6, 17, %13, 18
+ %18 = REG_SEQUENCE killed %4, 17, %13, 18
+ %20 = COPY %29
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, implicit %exec
+ %22 = COPY %29
+ %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, implicit %exec
+ %23 = V_MOV_B32_e32 1090519040, implicit %exec
+ %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 0, 2, implicit %exec
+ %26 = COPY %29
+ BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+
+...
+---
+# GCN: name: no_fold_imm_madak_mad_clamp_f32
+# GCN: %23 = V_MOV_B32_e32 1090519040, implicit %exec
+# GCN: %24 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit %exec
+
+name: no_fold_imm_madak_mad_clamp_f32
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_64_xexec }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32 }
+ - { id: 9, class: sreg_32_xm0 }
+ - { id: 10, class: sreg_64 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sreg_32_xm0 }
+ - { id: 13, class: sgpr_64 }
+ - { id: 14, class: sgpr_128 }
+ - { id: 15, class: sreg_32_xm0 }
+ - { id: 16, class: sreg_64 }
+ - { id: 17, class: sgpr_128 }
+ - { id: 18, class: sgpr_128 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: vgpr_32 }
+ - { id: 22, class: vreg_64 }
+ - { id: 23, class: vgpr_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vgpr_32 }
+ - { id: 26, class: vreg_64 }
+ - { id: 27, class: vgpr_32 }
+ - { id: 28, class: vreg_64 }
+ - { id: 29, class: vreg_64 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %27 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %28 = REG_SEQUENCE %3, 1, %27, 2
+ %11 = S_MOV_B32 61440
+ %12 = S_MOV_B32 0
+ %13 = REG_SEQUENCE killed %12, 1, killed %11, 2
+ %14 = REG_SEQUENCE killed %5, 17, %13, 18
+ %15 = S_MOV_B32 2
+ %29 = V_LSHL_B64 killed %28, killed %15, implicit %exec
+ %17 = REG_SEQUENCE killed %6, 17, %13, 18
+ %18 = REG_SEQUENCE killed %4, 17, %13, 18
+ %20 = COPY %29
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, implicit %exec
+ %22 = COPY %29
+ %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, implicit %exec
+ %23 = V_MOV_B32_e32 1090519040, implicit %exec
+ %24 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit %exec
+ %26 = COPY %29
+ BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+
+...
+---
+# GCN: name: no_fold_imm_madak_mad_omod_f32
+# GCN: %23 = V_MOV_B32_e32 1090519040, implicit %exec
+# GCN: %24 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 0, 1, implicit %exec
+
+name: no_fold_imm_madak_mad_omod_f32
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_64_xexec }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32 }
+ - { id: 9, class: sreg_32_xm0 }
+ - { id: 10, class: sreg_64 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sreg_32_xm0 }
+ - { id: 13, class: sgpr_64 }
+ - { id: 14, class: sgpr_128 }
+ - { id: 15, class: sreg_32_xm0 }
+ - { id: 16, class: sreg_64 }
+ - { id: 17, class: sgpr_128 }
+ - { id: 18, class: sgpr_128 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: vgpr_32 }
+ - { id: 22, class: vreg_64 }
+ - { id: 23, class: vgpr_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vgpr_32 }
+ - { id: 26, class: vreg_64 }
+ - { id: 27, class: vgpr_32 }
+ - { id: 28, class: vreg_64 }
+ - { id: 29, class: vreg_64 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %27 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %28 = REG_SEQUENCE %3, 1, %27, 2
+ %11 = S_MOV_B32 61440
+ %12 = S_MOV_B32 0
+ %13 = REG_SEQUENCE killed %12, 1, killed %11, 2
+ %14 = REG_SEQUENCE killed %5, 17, %13, 18
+ %15 = S_MOV_B32 2
+ %29 = V_LSHL_B64 killed %28, killed %15, implicit %exec
+ %17 = REG_SEQUENCE killed %6, 17, %13, 18
+ %18 = REG_SEQUENCE killed %4, 17, %13, 18
+ %20 = COPY %29
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, implicit %exec
+ %22 = COPY %29
+ %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, implicit %exec
+ %23 = V_MOV_B32_e32 1090519040, implicit %exec
+ %24 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 0, 1, implicit %exec
+ %26 = COPY %29
+ BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/fp-classify.ll b/test/CodeGen/AMDGPU/fp-classify.ll
index b7ffaed70c5a..cbc42979f2ee 100644
--- a/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/test/CodeGen/AMDGPU/fp-classify.ll
@@ -9,7 +9,7 @@ declare double @llvm.fabs.f64(double) #1
; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]]
; SI-NOT: v_cmp
; SI: s_endpgm
-define void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 {
%fabs = tail call float @llvm.fabs.f32(float %x) #1
%cmp = fcmp oeq float %fabs, 0x7FF0000000000000
%ext = zext i1 %cmp to i32
@@ -20,7 +20,7 @@ define void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 {
; SI-LABEL: {{^}}test_not_isinf_pattern_0:
; SI-NOT: v_cmp_class
; SI: s_endpgm
-define void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
%fabs = tail call float @llvm.fabs.f32(float %x) #1
%cmp = fcmp ueq float %fabs, 0x7FF0000000000000
%ext = zext i1 %cmp to i32
@@ -31,7 +31,7 @@ define void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x
; SI-LABEL: {{^}}test_not_isinf_pattern_1:
; SI-NOT: v_cmp_class
; SI: s_endpgm
-define void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 {
%fabs = tail call float @llvm.fabs.f32(float %x) #1
%cmp = fcmp oeq float %fabs, 0xFFF0000000000000
%ext = zext i1 %cmp to i32
@@ -45,7 +45,7 @@ define void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x
; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]]
; SI-NOT: v_cmp
; SI: s_endpgm
-define void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
%ord = fcmp ord float %x, 0.000000e+00
%x.fabs = tail call float @llvm.fabs.f32(float %x) #1
%ninf = fcmp une float %x.fabs, 0x7FF0000000000000
@@ -59,7 +59,7 @@ define void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x)
; SI-LABEL: {{^}}test_isfinite_not_pattern_0:
; SI-NOT: v_cmp_class_f32
; SI: s_endpgm
-define void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
%ord = fcmp ord float %x, 0.000000e+00
%x.fabs = tail call float @llvm.fabs.f32(float %x) #1
%ninf = fcmp une float %x.fabs, 0xFFF0000000000000
@@ -73,7 +73,7 @@ define void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float
; SI-LABEL: {{^}}test_isfinite_not_pattern_1:
; SI-NOT: v_cmp_class_f32
; SI: s_endpgm
-define void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 {
%ord = fcmp ord float %x, 0.000000e+00
%ninf = fcmp une float %x, 0x7FF0000000000000
%and = and i1 %ord, %ninf
@@ -86,7 +86,7 @@ define void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float
; SI-LABEL: {{^}}test_isfinite_not_pattern_2:
; SI-NOT: v_cmp_class_f32
; SI: s_endpgm
-define void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float %x, float %y) #0 {
+define amdgpu_kernel void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float %x, float %y) #0 {
%ord = fcmp ord float %x, 0.000000e+00
%x.fabs = tail call float @llvm.fabs.f32(float %y) #1
%ninf = fcmp une float %x.fabs, 0x7FF0000000000000
@@ -100,7 +100,7 @@ define void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float
; SI-LABEL: {{^}}test_isfinite_not_pattern_3:
; SI-NOT: v_cmp_class_f32
; SI: s_endpgm
-define void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float %x) #0 {
%ord = fcmp uno float %x, 0.000000e+00
%x.fabs = tail call float @llvm.fabs.f32(float %x) #1
%ninf = fcmp une float %x.fabs, 0x7FF0000000000000
@@ -114,7 +114,7 @@ define void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float
; SI-LABEL: {{^}}test_isfinite_not_pattern_4:
; SI-NOT: v_cmp_class_f32
; SI: s_endpgm
-define void @test_isfinite_not_pattern_4(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_isfinite_not_pattern_4(i32 addrspace(1)* nocapture %out, float %x) #0 {
%ord = fcmp ord float %x, 0.000000e+00
%x.fabs = tail call float @llvm.fabs.f32(float %x) #1
%ninf = fcmp one float %x.fabs, 0x7FF0000000000000
diff --git a/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/test/CodeGen/AMDGPU/fp16_to_fp32.ll
index 01bc53ff35a5..ce041364b76d 100644
--- a/test/CodeGen/AMDGPU/fp16_to_fp32.ll
+++ b/test/CodeGen/AMDGPU/fp16_to_fp32.ll
@@ -14,7 +14,7 @@ declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
; CM: MEM_RAT_CACHELESS STORE_DWORD [[RES:T[0-9]+\.[XYZW]]]
; EGCM: VTX_READ_16 [[VAL:T[0-9]+\.[XYZW]]]
; EGCM: FLT16_TO_FLT32{{[ *]*}}[[RES]], [[VAL]]
-define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
%val = load i16, i16 addrspace(1)* %in, align 2
%cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone
store float %cvt, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/test/CodeGen/AMDGPU/fp16_to_fp64.ll
index a9f493bf0ccd..70f0c0c1afdb 100644
--- a/test/CodeGen/AMDGPU/fp16_to_fp64.ll
+++ b/test/CodeGen/AMDGPU/fp16_to_fp64.ll
@@ -8,7 +8,7 @@ declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
; GCN: v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]]
; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]]
; GCN: buffer_store_dwordx2 [[RESULT]]
-define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
%val = load i16, i16 addrspace(1)* %in, align 2
%cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone
store double %cvt, double addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/test/CodeGen/AMDGPU/fp32_to_fp16.ll
index 3e426e3e94b1..2c6b1cb18f7e 100644
--- a/test/CodeGen/AMDGPU/fp32_to_fp16.ll
+++ b/test/CodeGen/AMDGPU/fp32_to_fp16.ll
@@ -12,7 +12,7 @@ declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
; EG: MEM_RAT MSKOR
; EG: VTX_READ_32
; EG: FLT32_TO_FLT16
-define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%val = load float, float addrspace(1)* %in, align 4
%cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone
store i16 %cvt, i16 addrspace(1)* %out, align 2
diff --git a/test/CodeGen/AMDGPU/fp_to_sint.f64.ll b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
index 1537d67cadcc..a7cddd09b762 100644
--- a/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
+++ b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
@@ -6,7 +6,7 @@ declare double @llvm.fabs.f64(double) #1
; FUNC-LABEL: @fp_to_sint_f64_i32
; SI: v_cvt_i32_f64_e32
-define void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) {
%result = fptosi double %in to i32
store i32 %result, i32 addrspace(1)* %out
ret void
@@ -15,7 +15,7 @@ define void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) {
; FUNC-LABEL: @fp_to_sint_v2f64_v2i32
; SI: v_cvt_i32_f64_e32
; SI: v_cvt_i32_f64_e32
-define void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> %in) {
%result = fptosi <2 x double> %in to <2 x i32>
store <2 x i32> %result, <2 x i32> addrspace(1)* %out
ret void
@@ -26,7 +26,7 @@ define void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> %
; SI: v_cvt_i32_f64_e32
; SI: v_cvt_i32_f64_e32
; SI: v_cvt_i32_f64_e32
-define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %in) {
%result = fptosi <4 x double> %in to <4 x i32>
store <4 x i32> %result, <4 x i32> addrspace(1)* %out
ret void
@@ -47,7 +47,7 @@ define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %
; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]]
; CI-DAG: v_cvt_i32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]]
; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
%val = load double, double addrspace(1)* %gep, align 8
@@ -58,7 +58,7 @@ define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in
; FUNC-LABEL: {{^}}fp_to_sint_f64_to_i1:
; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, s{{\[[0-9]+:[0-9]+\]}}
-define void @fp_to_sint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
+define amdgpu_kernel void @fp_to_sint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
%conv = fptosi double %in to i1
store i1 %conv, i1 addrspace(1)* %out
ret void
@@ -66,7 +66,7 @@ define void @fp_to_sint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
; FUNC-LABEL: {{^}}fp_to_sint_fabs_f64_to_i1:
; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, |s{{\[[0-9]+:[0-9]+\]}}|
-define void @fp_to_sint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
+define amdgpu_kernel void @fp_to_sint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
%in.fabs = call double @llvm.fabs.f64(double %in)
%conv = fptosi double %in.fabs to i1
store i1 %conv, i1 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fp_to_sint.ll b/test/CodeGen/AMDGPU/fp_to_sint.ll
index a2fa7a190745..630a7186e101 100644
--- a/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -8,7 +8,7 @@ declare float @llvm.fabs.f32(float) #1
; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
; SI: v_cvt_i32_f32_e32
; SI: s_endpgm
-define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) {
%conv = fptosi float %in to i32
store i32 %conv, i32 addrspace(1)* %out
ret void
@@ -16,7 +16,7 @@ define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) {
; FUNC-LABEL: {{^}}fp_to_sint_i32_fabs:
; SI: v_cvt_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}}
-define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) {
%in.fabs = call float @llvm.fabs.f32(float %in)
%conv = fptosi float %in.fabs to i32
store i32 %conv, i32 addrspace(1)* %out
@@ -28,7 +28,7 @@ define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) {
; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
; SI: v_cvt_i32_f32_e32
; SI: v_cvt_i32_f32_e32
-define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
%result = fptosi <2 x float> %in to <2 x i32>
store <2 x i32> %result, <2 x i32> addrspace(1)* %out
ret void
@@ -43,7 +43,7 @@ define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
; SI: v_cvt_i32_f32_e32
; SI: v_cvt_i32_f32_e32
; SI: v_cvt_i32_f32_e32
-define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%value = load <4 x float>, <4 x float> addrspace(1) * %in
%result = fptosi <4 x float> %value to <4 x i32>
store <4 x i32> %result, <4 x i32> addrspace(1)* %out
@@ -76,7 +76,7 @@ define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspac
; Check that the compiler doesn't crash with a "cannot select" error
; SI: s_endpgm
-define void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fp_to_sint_i64(i64 addrspace(1)* %out, float %in) {
entry:
%0 = fptosi float %in to i64
store i64 %0, i64 addrspace(1)* %out
@@ -128,7 +128,7 @@ entry:
; EG-DAG: CNDE_INT
; SI: s_endpgm
-define void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
+define amdgpu_kernel void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
%conv = fptosi <2 x float> %x to <2 x i64>
store <2 x i64> %conv, <2 x i64> addrspace(1)* %out
ret void
@@ -221,7 +221,7 @@ define void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
; EG-DAG: CNDE_INT
; SI: s_endpgm
-define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
+define amdgpu_kernel void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
%conv = fptosi <4 x float> %x to <4 x i64>
store <4 x i64> %conv, <4 x i64> addrspace(1)* %out
ret void
@@ -233,7 +233,7 @@ define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
; EG: AND_INT
; EG: SETE_DX10 {{[*]?}} T{{[0-9]+}}.{{[XYZW]}}, KC0[2].Z, literal.y,
; EG-NEXT: -1082130432(-1.000000e+00)
-define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
%conv = fptosi float %in to i1
store i1 %conv, i1 addrspace(1)* %out
ret void
@@ -241,7 +241,7 @@ define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
; FUNC-LABEL: {{^}}fp_to_uint_fabs_f32_to_i1:
; SI: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, |s{{[0-9]+}}|
-define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
%in.fabs = call float @llvm.fabs.f32(float %in)
%conv = fptosi float %in.fabs to i1
store i1 %conv, i1 addrspace(1)* %out
@@ -251,7 +251,7 @@ define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
; FUNC-LABEL: {{^}}fp_to_sint_f32_i16:
; GCN: v_cvt_i32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
; GCN: buffer_store_short [[VAL]]
-define void @fp_to_sint_f32_i16(i16 addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @fp_to_sint_f32_i16(i16 addrspace(1)* %out, float %in) #0 {
%sint = fptosi float %in to i16
store i16 %sint, i16 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/fp_to_uint.f64.ll b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
index d5bc416434df..4f597eb3f32c 100644
--- a/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
+++ b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
@@ -6,7 +6,7 @@ declare double @llvm.fabs.f64(double) #1
; SI-LABEL: {{^}}fp_to_uint_i32_f64:
; SI: v_cvt_u32_f64_e32
-define void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) {
%cast = fptoui double %in to i32
store i32 %cast, i32 addrspace(1)* %out, align 4
ret void
@@ -15,7 +15,7 @@ define void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) {
; SI-LABEL: @fp_to_uint_v2i32_v2f64
; SI: v_cvt_u32_f64_e32
; SI: v_cvt_u32_f64_e32
-define void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %in) {
%cast = fptoui <2 x double> %in to <2 x i32>
store <2 x i32> %cast, <2 x i32> addrspace(1)* %out, align 8
ret void
@@ -26,7 +26,7 @@ define void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %
; SI: v_cvt_u32_f64_e32
; SI: v_cvt_u32_f64_e32
; SI: v_cvt_u32_f64_e32
-define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %in) {
%cast = fptoui <4 x double> %in to <4 x i32>
store <4 x i32> %cast, <4 x i32> addrspace(1)* %out, align 8
ret void
@@ -47,7 +47,7 @@ define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %
; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]]
; CI-DAG: v_cvt_u32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]]
; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
%val = load double, double addrspace(1)* %gep, align 8
@@ -57,14 +57,14 @@ define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in
}
; SI-LABEL: @fp_to_uint_v2i64_v2f64
-define void @fp_to_uint_v2i64_v2f64(<2 x i64> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @fp_to_uint_v2i64_v2f64(<2 x i64> addrspace(1)* %out, <2 x double> %in) {
%cast = fptoui <2 x double> %in to <2 x i64>
store <2 x i64> %cast, <2 x i64> addrspace(1)* %out, align 16
ret void
}
; SI-LABEL: @fp_to_uint_v4i64_v4f64
-define void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %in) {
%cast = fptoui <4 x double> %in to <4 x i64>
store <4 x i64> %cast, <4 x i64> addrspace(1)* %out, align 32
ret void
@@ -72,7 +72,7 @@ define void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %
; FUNC-LABEL: {{^}}fp_to_uint_f64_to_i1:
; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, s{{\[[0-9]+:[0-9]+\]}}
-define void @fp_to_uint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
+define amdgpu_kernel void @fp_to_uint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
%conv = fptoui double %in to i1
store i1 %conv, i1 addrspace(1)* %out
ret void
@@ -80,7 +80,7 @@ define void @fp_to_uint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
; FUNC-LABEL: {{^}}fp_to_uint_fabs_f64_to_i1:
; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, |s{{\[[0-9]+:[0-9]+\]}}|
-define void @fp_to_uint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
+define amdgpu_kernel void @fp_to_uint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
%in.fabs = call double @llvm.fabs.f64(double %in)
%conv = fptoui double %in.fabs to i1
store i1 %conv, i1 addrspace(1)* %out
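
The change repeated throughout these hunks is mechanical: every test entry point gains the amdgpu_kernel calling convention, marking it as a kernel entry rather than an ordinary callable function so kernel argument lowering applies. A minimal before/after sketch (hypothetical kernel name, not from the commit):

; Before: default calling convention.
define void @example(i32 addrspace(1)* %out) {
  store i32 0, i32 addrspace(1)* %out
  ret void
}

; After: explicit kernel entry point.
define amdgpu_kernel void @example(i32 addrspace(1)* %out) {
  store i32 0, i32 addrspace(1)* %out
  ret void
}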
diff --git a/test/CodeGen/AMDGPU/fp_to_uint.ll b/test/CodeGen/AMDGPU/fp_to_uint.ll
index cbff9f22b073..fdb15801dc4e 100644
--- a/test/CodeGen/AMDGPU/fp_to_uint.ll
+++ b/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -9,7 +9,7 @@ declare float @llvm.fabs.f32(float) #1
; GCN: v_cvt_u32_f32_e32
; GCN: s_endpgm
-define void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) {
%conv = fptoui float %in to i32
store i32 %conv, i32 addrspace(1)* %out
ret void
@@ -21,7 +21,7 @@ define void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) {
; GCN: v_cvt_u32_f32_e32
; GCN: v_cvt_u32_f32_e32
-define void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
%result = fptoui <2 x float> %in to <2 x i32>
store <2 x i32> %result, <2 x i32> addrspace(1)* %out
ret void
@@ -37,7 +37,7 @@ define void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float>
; GCN: v_cvt_u32_f32_e32
; GCN: v_cvt_u32_f32_e32
-define void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%value = load <4 x float>, <4 x float> addrspace(1) * %in
%result = fptoui <4 x float> %value to <4 x i32>
store <4 x i32> %result, <4 x i32> addrspace(1)* %out
@@ -68,7 +68,7 @@ define void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float>
; EG-DAG: CNDE_INT
; GCN: s_endpgm
-define void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) {
%conv = fptoui float %x to i64
store i64 %conv, i64 addrspace(1)* %out
ret void
@@ -119,7 +119,7 @@ define void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) {
; EG-DAG: CNDE_INT
; GCN: s_endpgm
-define void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
+define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
%conv = fptoui <2 x float> %x to <2 x i64>
store <2 x i64> %conv, <2 x i64> addrspace(1)* %out
ret void
@@ -212,7 +212,7 @@ define void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float>
; EG-DAG: CNDE_INT
; GCN: s_endpgm
-define void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
+define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
%conv = fptoui <4 x float> %x to <4 x i64>
store <4 x i64> %conv, <4 x i64> addrspace(1)* %out
ret void
@@ -224,7 +224,7 @@ define void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float>
; EG: AND_INT
; EG: SETE_DX10 {{[*]?}} T{{[0-9]+}}.{{[XYZW]}}, KC0[2].Z, 1.0,
-define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
%conv = fptoui float %in to i1
store i1 %conv, i1 addrspace(1)* %out
ret void
@@ -232,7 +232,7 @@ define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
; FUNC-LABEL: {{^}}fp_to_uint_fabs_f32_to_i1:
; GCN: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, |s{{[0-9]+}}|
-define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
%in.fabs = call float @llvm.fabs.f32(float %in)
%conv = fptoui float %in.fabs to i1
store i1 %conv, i1 addrspace(1)* %out
@@ -246,7 +246,7 @@ define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
; SI: v_cvt_u32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
; VI: v_cvt_i32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
; GCN: buffer_store_short [[VAL]]
-define void @fp_to_uint_f32_to_i16(i16 addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @fp_to_uint_f32_to_i16(i16 addrspace(1)* %out, float %in) #0 {
%uint = fptoui float %in to i16
store i16 %uint, i16 addrspace(1)* %out
ret void
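
One detail worth flagging in the hunk above: for the f32 to i16 case, SI checks v_cvt_u32_f32_e32 while VI checks v_cvt_i32_f32_e32, i.e. the unsigned-to-i16 conversion is allowed to use the signed 32-bit convert, presumably because every value representable in u16 also lies within the signed convert's range (an inference; the commit itself does not say why). The IR shape being tested, as a reduced sketch with a hypothetical name:

; fptoui f32 -> i16; the i16 result is legalized through a 32-bit convert.
define amdgpu_kernel void @u16_sketch(i16 addrspace(1)* %out, float %in) {
  %u = fptoui float %in to i16
  store i16 %u, i16 addrspace(1)* %out
  ret void
}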
diff --git a/test/CodeGen/AMDGPU/fpext.f16.ll b/test/CodeGen/AMDGPU/fpext.f16.ll
index c4f5d7cdfb5d..03657176c383 100644
--- a/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -1,14 +1,15 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI -check-prefix=SIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=SIGFX9 %s
; GCN-LABEL: {{^}}fpext_f16_to_f32
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: v_cvt_f32_f16_e32 v[[R_F32:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_dword v[[R_F32]]
; GCN: s_endpgm
-define void @fpext_f16_to_f32(
+define amdgpu_kernel void @fpext_f16_to_f32(
float addrspace(1)* %r,
- half addrspace(1)* %a) {
+ half addrspace(1)* %a) #0 {
entry:
%a.val = load half, half addrspace(1)* %a
%r.val = fpext half %a.val to float
@@ -22,9 +23,9 @@ entry:
; GCN: v_cvt_f64_f32_e32 v{{\[}}[[R_F64_0:[0-9]+]]:[[R_F64_1:[0-9]+]]{{\]}}, v[[A_F32]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_F64_0]]:[[R_F64_1]]{{\]}}
; GCN: s_endpgm
-define void @fpext_f16_to_f64(
+define amdgpu_kernel void @fpext_f16_to_f64(
double addrspace(1)* %r,
- half addrspace(1)* %a) {
+ half addrspace(1)* %a) #0 {
entry:
%a.val = load half, half addrspace(1)* %a
%r.val = fpext half %a.val to double
@@ -34,15 +35,17 @@ entry:
; GCN-LABEL: {{^}}fpext_v2f16_to_v2f32
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]]
+; GFX9-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; GCN-DAG: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]]
+; SIGFX9: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]]
+; VI: v_cvt_f32_f16_sdwa v[[R_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GCN: buffer_store_dwordx2 v{{\[}}[[R_F32_0]]:[[R_F32_1]]{{\]}}
; GCN: s_endpgm
-define void @fpext_v2f16_to_v2f32(
+
+define amdgpu_kernel void @fpext_v2f16_to_v2f32(
<2 x float> addrspace(1)* %r,
- <2 x half> addrspace(1)* %a) {
+ <2 x half> addrspace(1)* %a) #0 {
entry:
%a.val = load <2 x half>, <2 x half> addrspace(1)* %a
%r.val = fpext <2 x half> %a.val to <2 x float>
@@ -51,15 +54,18 @@ entry:
}
; GCN-LABEL: {{^}}fpext_v2f16_to_v2f64
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; GCN: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_cvt_f64_f32_e32 v{{\[}}{{[0-9]+}}:[[R_F64_3:[0-9]+]]{{\]}}, v[[A_F32_1]]
-; GCN: v_cvt_f64_f32_e32 v{{\[}}[[R_F64_0:[0-9]+]]:{{[0-9]+}}{{\]}}, v[[A_F32_0]]
-; GCN: buffer_store_dwordx4 v{{\[}}[[R_F64_0]]:[[R_F64_3]]{{\]}}
+; GCN: buffer_load_dword
+; SIGFX9-DAG: v_lshrrev_b32_e32
+; SIGFX9-DAG: v_cvt_f32_f16_e32
+; VI: v_cvt_f32_f16_sdwa
+; GCN: v_cvt_f32_f16_e32
+
+; GCN: v_cvt_f64_f32_e32
+; GCN: v_cvt_f64_f32_e32
+; GCN: buffer_store_dwordx4
; GCN: s_endpgm
-define void @fpext_v2f16_to_v2f64(
+
+define amdgpu_kernel void @fpext_v2f16_to_v2f64(
<2 x double> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
@@ -68,3 +74,202 @@ entry:
store <2 x double> %r.val, <2 x double> addrspace(1)* %r
ret void
}
+
+; GCN-LABEL: {{^}}s_fneg_fpext_f16_to_f32:
+; GCN: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(float addrspace(1)* %r, i32 %a) {
+entry:
+ %a.trunc = trunc i32 %a to i16
+ %a.val = bitcast i16 %a.trunc to half
+ %r.val = fpext half %a.val to float
+ store float %r.val, float addrspace(1)* %r
+ ret void
+}
+
+; GCN-LABEL: {{^}}fneg_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -[[A]]
+define amdgpu_kernel void @fneg_fpext_f16_to_f32(
+ float addrspace(1)* %r,
+ half addrspace(1)* %a) {
+entry:
+ %a.val = load half, half addrspace(1)* %a
+ %a.neg = fsub half -0.0, %a.val
+ %r.val = fpext half %a.neg to float
+ store float %r.val, float addrspace(1)* %r
+ ret void
+}
+
+; GCN-LABEL: {{^}}fabs_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, |[[A]]|
+define amdgpu_kernel void @fabs_fpext_f16_to_f32(
+ float addrspace(1)* %r,
+ half addrspace(1)* %a) {
+entry:
+ %a.val = load half, half addrspace(1)* %a
+ %a.fabs = call half @llvm.fabs.f16(half %a.val)
+ %r.val = fpext half %a.fabs to float
+ store float %r.val, float addrspace(1)* %r
+ ret void
+}
+
+; GCN-LABEL: {{^}}fneg_fabs_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|[[A]]|
+define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32(
+ float addrspace(1)* %r,
+ half addrspace(1)* %a) {
+entry:
+ %a.val = load half, half addrspace(1)* %a
+ %a.fabs = call half @llvm.fabs.f16(half %a.val)
+ %a.fneg.fabs = fsub half -0.0, %a.fabs
+ %r.val = fpext half %a.fneg.fabs to float
+ store float %r.val, float addrspace(1)* %r
+ ret void
+}
+
+; GCN-LABEL: {{^}}fneg_multi_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN-DAG: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[A]]
+
+; FIXME: Using the source modifier here only wastes code size
+; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
+; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
+
+; GCN: store_dword [[CVT]]
+; GCN: store_short [[XOR]]
+define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32(
+ float addrspace(1)* %r,
+ half addrspace(1)* %a) {
+entry:
+ %a.val = load half, half addrspace(1)* %a
+ %a.neg = fsub half -0.0, %a.val
+ %r.val = fpext half %a.neg to float
+ store volatile float %r.val, float addrspace(1)* %r
+ store volatile half %a.neg, half addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}fneg_multi_foldable_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN-DAG: v_cvt_f32_f16_e64 [[CVTA_NEG:v[0-9]+]], -[[A]]
+; SI-DAG: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]]
+; SI: v_mul_f32_e32 [[MUL_F32:v[0-9]+]], [[CVTA]], [[CVTA_NEG]]
+; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]
+
+; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT_NEGA:v[0-9]+]], -[[A]]
+; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], -[[A]], [[A]]
+
+; GCN: buffer_store_dword [[CVTA_NEG]]
+; GCN: buffer_store_short [[MUL]]
+define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32(
+ float addrspace(1)* %r,
+ half addrspace(1)* %a) {
+entry:
+ %a.val = load half, half addrspace(1)* %a
+ %a.neg = fsub half -0.0, %a.val
+ %r.val = fpext half %a.neg to float
+ %mul = fmul half %a.neg, %a.val
+ store volatile float %r.val, float addrspace(1)* %r
+ store volatile half %mul, half addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}fabs_multi_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN-DAG: v_and_b32_e32 [[XOR:v[0-9]+]], 0x7fff, [[A]]
+
+; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
+; VI-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], |[[A]]|
+
+; GCN: store_dword [[CVT]]
+; GCN: store_short [[XOR]]
+define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32(
+ float addrspace(1)* %r,
+ half addrspace(1)* %a) {
+entry:
+ %a.val = load half, half addrspace(1)* %a
+ %a.fabs = call half @llvm.fabs.f16(half %a.val)
+ %r.val = fpext half %a.fabs to float
+ store volatile float %r.val, float addrspace(1)* %r
+ store volatile half %a.fabs, half addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}fabs_multi_foldable_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; SI: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]]
+; SI: v_mul_f32_e64 [[MUL_F32:v[0-9]+]], |[[CVTA]]|, [[CVTA]]
+; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]
+; SI: v_and_b32_e32 [[ABS_A:v[0-9]+]], 0x7fffffff, [[CVTA]]
+
+; GFX89-DAG: v_cvt_f32_f16_e64 [[ABS_A:v[0-9]+]], |[[A]]|
+; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], |[[A]]|, [[A]]
+
+; GCN: buffer_store_dword [[ABS_A]]
+; GCN: buffer_store_short [[MUL]]
+define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32(
+ float addrspace(1)* %r,
+ half addrspace(1)* %a) {
+entry:
+ %a.val = load half, half addrspace(1)* %a
+ %a.fabs = call half @llvm.fabs.f16(half %a.val)
+ %r.val = fpext half %a.fabs to float
+ %mul = fmul half %a.fabs, %a.val
+ store volatile float %r.val, float addrspace(1)* %r
+ store volatile half %mul, half addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}fabs_fneg_multi_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], 0x8000, [[A]]
+
+; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[OR]]
+; VI-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[OR]]|
+
+; GCN: buffer_store_dword [[CVT]]
+; GCN: buffer_store_short [[OR]]
+define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32(
+ float addrspace(1)* %r,
+ half addrspace(1)* %a) {
+entry:
+ %a.val = load half, half addrspace(1)* %a
+ %a.fabs = call half @llvm.fabs.f16(half %a.val)
+ %a.fneg.fabs = fsub half -0.0, %a.fabs
+ %r.val = fpext half %a.fneg.fabs to float
+ store volatile float %r.val, float addrspace(1)* %r
+ store volatile half %a.fneg.fabs, half addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; SI: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]]
+; SI: v_mul_f32_e64 [[MUL_F32:v[0-9]+]], -|[[CVTA]]|, [[CVTA]]
+; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]
+; SI: v_or_b32_e32 [[FABS_FNEG:v[0-9]+]], 0x80000000, [[CVTA]]
+
+; GFX89-DAG: v_cvt_f32_f16_e64 [[FABS_FNEG:v[0-9]+]], -|[[A]]|
+; GFX89-DAG: v_mul_f16_e64 [[MUL:v[0-9]+]], -|[[A]]|, [[A]]
+
+; GCN: buffer_store_dword [[FABS_FNEG]]
+; GCN: buffer_store_short [[MUL]]
+define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32(
+ float addrspace(1)* %r,
+ half addrspace(1)* %a) {
+entry:
+ %a.val = load half, half addrspace(1)* %a
+ %a.fabs = call half @llvm.fabs.f16(half %a.val)
+ %a.fneg.fabs = fsub half -0.0, %a.fabs
+ %r.val = fpext half %a.fneg.fabs to float
+ %mul = fmul half %a.fneg.fabs, %a.val
+ store volatile float %r.val, float addrspace(1)* %r
+ store volatile half %mul, half addrspace(1)* undef
+ ret void
+}
+
+declare half @llvm.fabs.f16(half) #1
+
+attributes #1 = { nounwind readnone }
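
The RUN-line updates above use the standard FileCheck idiom for covering several subtargets with one test body: each llc invocation passes every -check-prefix that applies to its target, so a line tagged SIGFX9 is checked for both the SI and gfx901 runs, while GFX89 covers fiji and gfx901. A reduced sketch (hypothetical test; flags taken from the hunks above):

; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s

; GCN-LABEL: {{^}}ext_sketch:
; The GCN line is checked for both runs; SI and GFX9 lines only for theirs.
; GCN: buffer_load_ushort
; SI: v_cvt_f32_f16_e32
; GFX9: v_cvt_f32_f16_e32
define amdgpu_kernel void @ext_sketch(float addrspace(1)* %r, half addrspace(1)* %a) {
  %v = load half, half addrspace(1)* %a
  %x = fpext half %v to float
  store float %x, float addrspace(1)* %r
  ret void
}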
diff --git a/test/CodeGen/AMDGPU/fpext.ll b/test/CodeGen/AMDGPU/fpext.ll
index 6dc84b01d734..b11e2ea056c3 100644
--- a/test/CodeGen/AMDGPU/fpext.ll
+++ b/test/CodeGen/AMDGPU/fpext.ll
@@ -3,7 +3,7 @@
; FUNC-LABEL: {{^}}fpext_f32_to_f64:
; SI: v_cvt_f64_f32_e32 {{v\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-define void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) {
%result = fpext float %in to double
store double %result, double addrspace(1)* %out
ret void
@@ -12,7 +12,7 @@ define void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) {
; FUNC-LABEL: {{^}}fpext_v2f32_to_v2f64:
; SI: v_cvt_f64_f32_e32
; SI: v_cvt_f64_f32_e32
-define void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %in) {
%result = fpext <2 x float> %in to <2 x double>
store <2 x double> %result, <2 x double> addrspace(1)* %out
ret void
@@ -22,7 +22,7 @@ define void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %
; SI: v_cvt_f64_f32_e32
; SI: v_cvt_f64_f32_e32
; SI: v_cvt_f64_f32_e32
-define void @fpext_v3f32_to_v3f64(<3 x double> addrspace(1)* %out, <3 x float> %in) {
+define amdgpu_kernel void @fpext_v3f32_to_v3f64(<3 x double> addrspace(1)* %out, <3 x float> %in) {
%result = fpext <3 x float> %in to <3 x double>
store <3 x double> %result, <3 x double> addrspace(1)* %out
ret void
@@ -33,7 +33,7 @@ define void @fpext_v3f32_to_v3f64(<3 x double> addrspace(1)* %out, <3 x float> %
; SI: v_cvt_f64_f32_e32
; SI: v_cvt_f64_f32_e32
; SI: v_cvt_f64_f32_e32
-define void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %in) {
%result = fpext <4 x float> %in to <4 x double>
store <4 x double> %result, <4 x double> addrspace(1)* %out
ret void
@@ -48,7 +48,7 @@ define void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %
; SI: v_cvt_f64_f32_e32
; SI: v_cvt_f64_f32_e32
; SI: v_cvt_f64_f32_e32
-define void @fpext_v8f32_to_v8f64(<8 x double> addrspace(1)* %out, <8 x float> %in) {
+define amdgpu_kernel void @fpext_v8f32_to_v8f64(<8 x double> addrspace(1)* %out, <8 x float> %in) {
%result = fpext <8 x float> %in to <8 x double>
store <8 x double> %result, <8 x double> addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/fptosi.f16.ll b/test/CodeGen/AMDGPU/fptosi.f16.ll
index 71f56d730e96..50e56e08416a 100644
--- a/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -7,7 +7,7 @@
; GCN: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_short v[[R_I16]]
; GCN: s_endpgm
-define void @fptosi_f16_to_i16(
+define amdgpu_kernel void @fptosi_f16_to_i16(
i16 addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -23,7 +23,7 @@ entry:
; GCN: v_cvt_i32_f32_e32 v[[R_I32:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
-define void @fptosi_f16_to_i32(
+define amdgpu_kernel void @fptosi_f16_to_i32(
i32 addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -40,7 +40,7 @@ entry:
; GCN: buffer_load_ushort
; GCN: v_cvt_f32_f16_e32
; GCN: s_endpgm
-define void @fptosi_f16_to_i64(
+define amdgpu_kernel void @fptosi_f16_to_i64(
i64 addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -52,17 +52,26 @@ entry:
; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; GCN: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
-; GCN: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
-; GCN: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]]
+
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
+; SI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
+; SI: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]]
+; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
+; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]]
+
+; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
+; VI: v_cvt_i32_f32_sdwa v[[R_I16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_1]], v[[R_I16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
; GCN: buffer_store_dword v[[R_V2_I16]]
; GCN: s_endpgm
-define void @fptosi_v2f16_to_v2i16(
+
+define amdgpu_kernel void @fptosi_v2f16_to_v2i16(
<2 x i16> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
@@ -75,12 +84,13 @@ entry:
; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i32
; GCN: buffer_load_dword
; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; VI: v_cvt_f32_f16_sdwa
; GCN: v_cvt_i32_f32_e32
; GCN: v_cvt_i32_f32_e32
; GCN: buffer_store_dwordx2
; GCN: s_endpgm
-define void @fptosi_v2f16_to_v2i32(
+define amdgpu_kernel void @fptosi_v2f16_to_v2i32(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
@@ -96,9 +106,10 @@ entry:
; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i64
; GCN: buffer_load_dword
; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; VI: v_cvt_f32_f16_sdwa
; GCN: s_endpgm
-define void @fptosi_v2f16_to_v2i64(
+define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
<2 x i64> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
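
The VI-specific checks introduced here reflect SDWA (sub-dword addressing) operand selection: rather than unpacking the high half of a <2 x half> with v_lshrrev_b32 and converting it separately, VI reads WORD_1 of the packed register directly (src0_sel:WORD_1) and can write just WORD_1 of the destination (dst_sel:WORD_1), so most of the shift/and/or repacking sequence disappears. The IR that exercises this is simply a packed conversion; a minimal sketch with a hypothetical name:

; On VI the hi half of %a.val is converted straight out of WORD_1 via
; v_cvt_f32_f16_sdwa, with no explicit v_lshrrev_b32 unpack first.
define amdgpu_kernel void @v2f16_sketch(
    <2 x i16> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %r.val = fptosi <2 x half> %a.val to <2 x i16>
  store <2 x i16> %r.val, <2 x i16> addrspace(1)* %r
  ret void
}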
diff --git a/test/CodeGen/AMDGPU/fptoui.f16.ll b/test/CodeGen/AMDGPU/fptoui.f16.ll
index a6876624a0c6..2afa6111cf17 100644
--- a/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -8,7 +8,7 @@
; VI: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_short v[[R_I16]]
; GCN: s_endpgm
-define void @fptoui_f16_to_i16(
+define amdgpu_kernel void @fptoui_f16_to_i16(
i16 addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -24,7 +24,7 @@ entry:
; GCN: v_cvt_u32_f32_e32 v[[R_I32:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
-define void @fptoui_f16_to_i32(
+define amdgpu_kernel void @fptoui_f16_to_i32(
i32 addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -41,7 +41,7 @@ entry:
; GCN: buffer_load_ushort
; GCN: v_cvt_f32_f16_e32
; GCN: s_endpgm
-define void @fptoui_f16_to_i64(
+define amdgpu_kernel void @fptoui_f16_to_i64(
i64 addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -53,18 +53,25 @@ entry:
; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; GCN-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_cvt_u32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
; SI: v_cvt_u32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
-; VI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
+; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
+; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]]
+
+; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
-; GCN: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]]
+; VI: v_cvt_i32_f32_sdwa v[[R_I16_0:[0-9]+]], v[[A_F32_0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_0]], v[[R_I16_1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
; GCN: buffer_store_dword v[[R_V2_I16]]
; GCN: s_endpgm
-define void @fptoui_v2f16_to_v2i16(
+
+define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
<2 x i16> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
@@ -77,12 +84,13 @@ entry:
; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i32
; GCN: buffer_load_dword
; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; VI: v_cvt_f32_f16_sdwa
; GCN: v_cvt_u32_f32_e32
; GCN: v_cvt_u32_f32_e32
; GCN: buffer_store_dwordx2
; GCN: s_endpgm
-define void @fptoui_v2f16_to_v2i32(
+define amdgpu_kernel void @fptoui_v2f16_to_v2i32(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
@@ -98,9 +106,10 @@ entry:
; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i64
; GCN: buffer_load_dword
; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; VI: v_cvt_f32_f16_sdwa
; GCN: s_endpgm
-define void @fptoui_v2f16_to_v2i64(
+define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
<2 x i64> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/fptrunc.f16.ll b/test/CodeGen/AMDGPU/fptrunc.f16.ll
index 284fc53c8240..bc72f4424c98 100644
--- a/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -1,12 +1,13 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
-; GCN-LABEL: {{^}}fptrunc_f32_to_f16
+; GCN-LABEL: {{^}}fptrunc_f32_to_f16:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @fptrunc_f32_to_f16(
+define amdgpu_kernel void @fptrunc_f32_to_f16(
half addrspace(1)* %r,
float addrspace(1)* %a) {
entry:
@@ -16,13 +17,13 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fptrunc_f64_to_f16
+; GCN-LABEL: {{^}}fptrunc_f64_to_f16:
; GCN: buffer_load_dwordx2 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_1:[0-9]+]]{{\]}}
; GCN: v_cvt_f32_f64_e32 v[[A_F32:[0-9]+]], v{{\[}}[[A_F64_0]]:[[A_F64_1]]{{\]}}
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @fptrunc_f64_to_f16(
+define amdgpu_kernel void @fptrunc_f64_to_f16(
half addrspace(1)* %r,
double addrspace(1)* %a) {
entry:
@@ -32,16 +33,24 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fptrunc_v2f32_to_v2f16
+; GCN-LABEL: {{^}}fptrunc_v2f32_to_v2f16:
; GCN: buffer_load_dwordx2 v{{\[}}[[A_F32_0:[0-9]+]]:[[A_F32_1:[0-9]+]]{{\]}}
; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
-; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
-; GCN-DAG: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
+; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
+; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
+; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @fptrunc_v2f32_to_v2f16(
+
+define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
<2 x half> addrspace(1)* %r,
<2 x float> addrspace(1)* %a) {
entry:
@@ -51,17 +60,23 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fptrunc_v2f64_to_v2f16
+; GCN-LABEL: {{^}}fptrunc_v2f64_to_v2f16:
; GCN: buffer_load_dwordx4 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_3:[0-9]+]]{{\]}}
-; GCN: v_cvt_f32_f64_e32 v[[A_F32_0:[0-9]+]], v{{\[}}[[A_F64_0]]:{{[0-9]+}}{{\]}}
-; GCN: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v{{\[}}{{[0-9]+}}:[[A_F64_3]]{{\]}}
-; GCN: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
-; GCN: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_0:[0-9]+]], v{{\[}}[[A_F64_0]]:{{[0-9]+}}{{\]}}
+; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v{{\[}}{{[0-9]+}}:[[A_F64_3]]{{\]}}
+; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
+
+; VI: v_cvt_f16_f32_sdwa v[[R_F16_HI:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+
+; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
+; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
+; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
-define void @fptrunc_v2f64_to_v2f16(
+
+define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
<2 x half> addrspace(1)* %r,
<2 x double> addrspace(1)* %a) {
entry:
@@ -70,3 +85,109 @@ entry:
store <2 x half> %r.val, <2 x half> addrspace(1)* %r
ret void
}
+
+; GCN-LABEL: {{^}}fneg_fptrunc_f32_to_f16:
+; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
+; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -v[[A_F32]]
+; GCN: buffer_store_short v[[R_F16]]
+; GCN: s_endpgm
+define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
+ half addrspace(1)* %r,
+ float addrspace(1)* %a) {
+entry:
+ %a.val = load float, float addrspace(1)* %a
+ %a.fneg = fsub float -0.0, %a.val
+ %r.val = fptrunc float %a.fneg to half
+ store half %r.val, half addrspace(1)* %r
+ ret void
+}
+
+; GCN-LABEL: {{^}}fabs_fptrunc_f32_to_f16:
+; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
+; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]|
+; GCN: buffer_store_short v[[R_F16]]
+; GCN: s_endpgm
+define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
+ half addrspace(1)* %r,
+ float addrspace(1)* %a) {
+entry:
+ %a.val = load float, float addrspace(1)* %a
+ %a.fabs = call float @llvm.fabs.f32(float %a.val)
+ %r.val = fptrunc float %a.fabs to half
+ store half %r.val, half addrspace(1)* %r
+ ret void
+}
+
+; GCN-LABEL: {{^}}fneg_fabs_fptrunc_f32_to_f16:
+; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
+; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -|v[[A_F32]]|
+; GCN: buffer_store_short v[[R_F16]]
+; GCN: s_endpgm
+define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
+ half addrspace(1)* %r,
+ float addrspace(1)* %a) #0 {
+entry:
+ %a.val = load float, float addrspace(1)* %a
+ %a.fabs = call float @llvm.fabs.f32(float %a.val)
+ %a.fneg.fabs = fsub float -0.0, %a.fabs
+ %r.val = fptrunc float %a.fneg.fabs to half
+ store half %r.val, half addrspace(1)* %r
+ ret void
+}
+
+; GCN-LABEL: {{^}}fptrunc_f32_to_f16_zext_i32:
+; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
+; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
+; GCN-NOT: v[[R_F16]]
+; GCN: buffer_store_dword v[[R_F16]]
+define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
+ i32 addrspace(1)* %r,
+ float addrspace(1)* %a) #0 {
+entry:
+ %a.val = load float, float addrspace(1)* %a
+ %r.val = fptrunc float %a.val to half
+ %r.i16 = bitcast half %r.val to i16
+ %zext = zext i16 %r.i16 to i32
+ store i32 %zext, i32 addrspace(1)* %r
+ ret void
+}
+
+; GCN-LABEL: {{^}}fptrunc_fabs_f32_to_f16_zext_i32:
+; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
+; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]|
+; GCN-NOT: v[[R_F16]]
+; GCN: buffer_store_dword v[[R_F16]]
+define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
+ i32 addrspace(1)* %r,
+ float addrspace(1)* %a) #0 {
+entry:
+ %a.val = load float, float addrspace(1)* %a
+ %a.fabs = call float @llvm.fabs.f32(float %a.val)
+ %r.val = fptrunc float %a.fabs to half
+ %r.i16 = bitcast half %r.val to i16
+ %zext = zext i16 %r.i16 to i32
+ store i32 %zext, i32 addrspace(1)* %r
+ ret void
+}
+
+; GCN-LABEL: {{^}}fptrunc_f32_to_f16_sext_i32:
+; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
+; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
+; GCN: v_bfe_i32 v[[R_F16_SEXT:[0-9]+]], v[[R_F16]], 0, 16
+; GCN: buffer_store_dword v[[R_F16_SEXT]]
+define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
+ i32 addrspace(1)* %r,
+ float addrspace(1)* %a) #0 {
+entry:
+ %a.val = load float, float addrspace(1)* %a
+ %r.val = fptrunc float %a.val to half
+ %r.i16 = bitcast half %r.val to i16
+ %zext = sext i16 %r.i16 to i32
+ store i32 %zext, i32 addrspace(1)* %r
+ ret void
+}
+
+declare float @llvm.fabs.f32(float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
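
All of the new fneg/fabs tests above probe the same fold: a source fneg or fabs becomes the - or | | source modifier on the VOP3 form of the convert (hence checks such as v_cvt_f16_f32_e64 v[[R_F16]], -|v[[A_F32]]|) instead of separate v_xor_b32/v_and_b32 bit operations, and the zext/sext variants additionally check that the converted 16-bit result needs no extra masking before the i32 store (the GCN-NOT lines), while sext still requires the v_bfe_i32. The IR idiom, condensed from the tests above:

; fneg(fabs(x)) folds into the -| | source modifier of the convert.
define amdgpu_kernel void @fold_sketch(half addrspace(1)* %r, float addrspace(1)* %a) {
entry:
  %x = load float, float addrspace(1)* %a
  %x.fabs = call float @llvm.fabs.f32(float %x)
  %x.neg.fabs = fsub float -0.0, %x.fabs
  %h = fptrunc float %x.neg.fabs to half
  store half %h, half addrspace(1)* %r
  ret void
}

declare float @llvm.fabs.f32(float)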
diff --git a/test/CodeGen/AMDGPU/fptrunc.ll b/test/CodeGen/AMDGPU/fptrunc.ll
index 0c7b67406a89..d9c5b7e6f359 100644
--- a/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/test/CodeGen/AMDGPU/fptrunc.ll
@@ -4,7 +4,7 @@
; FUNC-LABEL: {{^}}fptrunc_f64_to_f32:
; GCN: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) {
%result = fptrunc double %in to float
store float %result, float addrspace(1)* %out
ret void
@@ -14,7 +14,7 @@ define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) {
; GCN-NOT: v_cvt
; GCN-UNSAFE: v_cvt_f32_f64_e32 [[F32:v[0-9]+]]
; GCN-UNSAFE: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[F32]]
-define void @fptrunc_f64_to_f16(i16 addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fptrunc_f64_to_f16(i16 addrspace(1)* %out, double %in) {
%result = fptrunc double %in to half
%result_i16 = bitcast half %result to i16
store i16 %result_i16, i16 addrspace(1)* %out
@@ -24,7 +24,7 @@ define void @fptrunc_f64_to_f16(i16 addrspace(1)* %out, double %in) {
; FUNC-LABEL: {{^}}fptrunc_v2f64_to_v2f32:
; GCN: v_cvt_f32_f64_e32
; GCN: v_cvt_f32_f64_e32
-define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) {
%result = fptrunc <2 x double> %in to <2 x float>
store <2 x float> %result, <2 x float> addrspace(1)* %out
ret void
@@ -35,7 +35,7 @@ define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double>
; GCN: v_cvt_f32_f64_e32
; GCN: v_cvt_f32_f64_e32
; GCN: v_cvt_f32_f64_e32
-define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) {
%result = fptrunc <4 x double> %in to <4 x float>
store <4 x float> %result, <4 x float> addrspace(1)* %out
ret void
@@ -50,7 +50,7 @@ define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double>
; GCN: v_cvt_f32_f64_e32
; GCN: v_cvt_f32_f64_e32
; GCN: v_cvt_f32_f64_e32
-define void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) {
+define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) {
%result = fptrunc <8 x double> %in to <8 x float>
store <8 x float> %result, <8 x float> addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/fract.f64.ll b/test/CodeGen/AMDGPU/fract.f64.ll
index 0651dce8d95c..7a5bcfffa3f3 100644
--- a/test/CodeGen/AMDGPU/fract.f64.ll
+++ b/test/CodeGen/AMDGPU/fract.f64.ll
@@ -27,7 +27,7 @@ declare double @llvm.floor.f64(double) #0
; GCN-UNSAFE: v_fract_f64_e32 [[FRACT:v\[[0-9]+:[0-9]+\]]], [[X]]
; GCN: buffer_store_dwordx2 [[FRACT]]
-define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
+define amdgpu_kernel void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
%x = load double, double addrspace(1)* %src
%floor.x = call double @llvm.floor.f64(double %x)
%fract = fsub double %x, %floor.x
@@ -54,7 +54,7 @@ define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1
; GCN-UNSAFE: v_fract_f64_e64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -[[X]]
; GCN: buffer_store_dwordx2 [[FRACT]]
-define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
+define amdgpu_kernel void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
%x = load double, double addrspace(1)* %src
%neg.x = fsub double -0.0, %x
%floor.neg.x = call double @llvm.floor.f64(double %neg.x)
@@ -82,7 +82,7 @@ define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src)
; GCN-UNSAFE: v_fract_f64_e64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -|[[X]]|
; GCN: buffer_store_dwordx2 [[FRACT]]
-define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
+define amdgpu_kernel void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
%x = load double, double addrspace(1)* %src
%abs.x = call double @llvm.fabs.f64(double %x)
%neg.abs.x = fsub double -0.0, %abs.x
@@ -98,7 +98,7 @@ define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %
; VI-UNSAFE-DAG: v_fract_f64_e32 [[FRACT:v\[[0-9]+:[0-9]+\]]], [[X]]
; VI-UNSAFE: buffer_store_dwordx2 [[FLOOR]]
; VI-UNSAFE: buffer_store_dwordx2 [[FRACT]]
-define void @multi_use_floor_fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
+define amdgpu_kernel void @multi_use_floor_fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
%x = load double, double addrspace(1)* %src
%floor.x = call double @llvm.floor.f64(double %x)
%fract = fsub double %x, %floor.x
diff --git a/test/CodeGen/AMDGPU/fract.ll b/test/CodeGen/AMDGPU/fract.ll
index 4e1a503b1298..207fe280c9a6 100644
--- a/test/CodeGen/AMDGPU/fract.ll
+++ b/test/CodeGen/AMDGPU/fract.ll
@@ -14,7 +14,7 @@ declare float @llvm.floor.f32(float) #0
; GCN-UNSAFE: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]]
; GCN: buffer_store_dword [[RESULT]]
-define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
+define amdgpu_kernel void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
%x = load float, float addrspace(1)* %src
%floor.x = call float @llvm.floor.f32(float %x)
%fract = fsub float %x, %floor.x
@@ -29,7 +29,7 @@ define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
; GCN-UNSAFE: v_fract_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT:v[0-9]+]]
; GCN: buffer_store_dword [[RESULT]]
-define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
+define amdgpu_kernel void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
%x = load float, float addrspace(1)* %src
%x.neg = fsub float -0.0, %x
%floor.x.neg = call float @llvm.floor.f32(float %x.neg)
@@ -45,7 +45,7 @@ define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) #
; GCN-UNSAFE: v_fract_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT:v[0-9]+]]|
; GCN: buffer_store_dword [[RESULT]]
-define void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
+define amdgpu_kernel void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
%x = load float, float addrspace(1)* %src
%abs.x = call float @llvm.fabs.f32(float %x)
%neg.abs.x = fsub float -0.0, %abs.x
@@ -61,7 +61,7 @@ define void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %sr
; GCN-UNSAFE: buffer_store_dword [[FLOOR]]
; GCN-UNSAFE: buffer_store_dword [[FRACT]]
-define void @multi_use_floor_fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
+define amdgpu_kernel void @multi_use_floor_fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
%x = load float, float addrspace(1)* %src
%floor.x = call float @llvm.floor.f32(float %x)
%fract = fsub float %x, %floor.x
diff --git a/test/CodeGen/AMDGPU/frem.ll b/test/CodeGen/AMDGPU/frem.ll
index 039623c02194..9778069d0477 100644
--- a/test/CodeGen/AMDGPU/frem.ll
+++ b/test/CodeGen/AMDGPU/frem.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -enable-misched < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}frem_f32:
; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}}
@@ -12,10 +12,10 @@
; GCN: v_mul_f32_e32
; GCN: v_div_fmas_f32
; GCN: v_div_fixup_f32
-; GCN: v_trunc_f32_e32
-; GCN: v_mad_f32
+; GCN: v_trunc_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_endpgm
-define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
float addrspace(1)* %in2) #0 {
%gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
%r0 = load float, float addrspace(1)* %in1, align 4
@@ -33,8 +33,7 @@ define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]]
; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]]
; GCN: buffer_store_dword [[RESULT]]
-; GCN: s_endpgm
-define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
float addrspace(1)* %in2) #1 {
%gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
%r0 = load float, float addrspace(1)* %in1, align 4
@@ -55,7 +54,7 @@ define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
; GCN: v_add_f64
; GCN: buffer_store_dwordx2
; GCN: s_endpgm
-define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) #0 {
%r0 = load double, double addrspace(1)* %in1, align 8
%r1 = load double, double addrspace(1)* %in2, align 8
@@ -71,7 +70,7 @@ define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
; CI: v_trunc_f64_e32
; GCN: v_fma_f64
; GCN: s_endpgm
-define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) #1 {
%r0 = load double, double addrspace(1)* %in1, align 8
%r1 = load double, double addrspace(1)* %in2, align 8
@@ -80,7 +79,7 @@ define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in
ret void
}
-define void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
+define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
<2 x float> addrspace(1)* %in2) #0 {
%gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4
%r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8
@@ -90,7 +89,7 @@ define void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)
ret void
}
-define void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
+define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
<4 x float> addrspace(1)* %in2) #0 {
%gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4
%r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16
@@ -100,7 +99,7 @@ define void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)
ret void
}
-define void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
<2 x double> addrspace(1)* %in2) #0 {
%gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4
%r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16
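
For context on the frem checks: these targets have no hardware remainder instruction, so frem is expanded as x - trunc(x / y) * y, which is what the v_div_* sequence followed by v_trunc_f32 and a v_mad_f32 with a negated first operand implements (and why the tightened checks above now pin the -v operand on the mad). Written out by hand as a sketch, the expansion being matched is:

; frem expansion sketch: r = x - trunc(x / y) * y
define amdgpu_kernel void @frem_sketch(float addrspace(1)* %out, float %x, float %y) {
  %div = fdiv float %x, %y
  %trunc = call float @llvm.trunc.f32(float %div)
  %mul = fmul float %trunc, %y
  %rem = fsub float %x, %mul
  store float %rem, float addrspace(1)* %out
  ret void
}

declare float @llvm.trunc.f32(float)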
diff --git a/test/CodeGen/AMDGPU/fsqrt.f64.ll b/test/CodeGen/AMDGPU/fsqrt.f64.ll
index ed040436a61a..453d8fb37f2f 100644
--- a/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -3,7 +3,7 @@
; FUNC-LABEL: {{^}}v_safe_fsqrt_f64:
; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @v_safe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_safe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #1 {
%r0 = load double, double addrspace(1)* %in
%r1 = call double @llvm.sqrt.f64(double %r0)
store double %r1, double addrspace(1)* %out
@@ -12,7 +12,7 @@ define void @v_safe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %i
; FUNC-LABEL: {{^}}v_unsafe_fsqrt_f64:
; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @v_unsafe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #2 {
+define amdgpu_kernel void @v_unsafe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #2 {
%r0 = load double, double addrspace(1)* %in
%r1 = call double @llvm.sqrt.f64(double %r0)
store double %r1, double addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fsqrt.ll b/test/CodeGen/AMDGPU/fsqrt.ll
index b6526b8e0787..a0fd3411ca05 100644
--- a/test/CodeGen/AMDGPU/fsqrt.ll
+++ b/test/CodeGen/AMDGPU/fsqrt.ll
@@ -7,7 +7,7 @@
; FUNC-LABEL: {{^}}v_safe_fsqrt_f32:
; GCN: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}}
-define void @v_safe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_safe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
%r0 = load float, float addrspace(1)* %in
%r1 = call float @llvm.sqrt.f32(float %r0)
store float %r1, float addrspace(1)* %out
@@ -16,7 +16,7 @@ define void @v_safe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in)
; FUNC-LABEL: {{^}}v_unsafe_fsqrt_f32:
; GCN: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}}
-define void @v_unsafe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #2 {
+define amdgpu_kernel void @v_unsafe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #2 {
%r0 = load float, float addrspace(1)* %in
%r1 = call float @llvm.sqrt.f32(float %r0)
store float %r1, float addrspace(1)* %out
@@ -29,7 +29,7 @@ define void @v_unsafe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %i
; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z
; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS
-define void @s_sqrt_f32(float addrspace(1)* %out, float %in) #1 {
+define amdgpu_kernel void @s_sqrt_f32(float addrspace(1)* %out, float %in) #1 {
entry:
%fdiv = call float @llvm.sqrt.f32(float %in)
store float %fdiv, float addrspace(1)* %out
@@ -44,7 +44,7 @@ entry:
; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS
; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X
; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS
-define void @s_sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+define amdgpu_kernel void @s_sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
entry:
%fdiv = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
@@ -65,7 +65,7 @@ entry:
; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS
; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X
; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS
-define void @s_sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
+define amdgpu_kernel void @s_sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
entry:
%fdiv = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
store <4 x float> %fdiv, <4 x float> addrspace(1)* %out
@@ -75,7 +75,7 @@ entry:
; FUNC-LABEL: {{^}}elim_redun_check_neg0:
; GCN: v_sqrt_f32_e32
; GCN-NOT: v_cndmask
-define void @elim_redun_check_neg0(float addrspace(1)* %out, float %in) #1 {
+define amdgpu_kernel void @elim_redun_check_neg0(float addrspace(1)* %out, float %in) #1 {
entry:
%sqrt = call float @llvm.sqrt.f32(float %in)
%cmp = fcmp olt float %in, -0.000000e+00
@@ -87,7 +87,7 @@ entry:
; FUNC-LABEL: {{^}}elim_redun_check_pos0:
; GCN: v_sqrt_f32_e32
; GCN-NOT: v_cndmask
-define void @elim_redun_check_pos0(float addrspace(1)* %out, float %in) #1 {
+define amdgpu_kernel void @elim_redun_check_pos0(float addrspace(1)* %out, float %in) #1 {
entry:
%sqrt = call float @llvm.sqrt.f32(float %in)
%cmp = fcmp olt float %in, 0.000000e+00
@@ -99,7 +99,7 @@ entry:
; FUNC-LABEL: {{^}}elim_redun_check_ult:
; GCN: v_sqrt_f32_e32
; GCN-NOT: v_cndmask
-define void @elim_redun_check_ult(float addrspace(1)* %out, float %in) #1 {
+define amdgpu_kernel void @elim_redun_check_ult(float addrspace(1)* %out, float %in) #1 {
entry:
%sqrt = call float @llvm.sqrt.f32(float %in)
%cmp = fcmp ult float %in, -0.000000e+00
@@ -112,7 +112,7 @@ entry:
; GCN: v_sqrt_f32_e32
; GCN: v_sqrt_f32_e32
; GCN-NOT: v_cndmask
-define void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+define amdgpu_kernel void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
entry:
%sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
%cmp = fcmp olt <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
@@ -125,7 +125,7 @@ entry:
; GCN: v_sqrt_f32_e32
; GCN: v_sqrt_f32_e32
; GCN-NOT: v_cndmask
-define void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+define amdgpu_kernel void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
entry:
%sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
%cmp = fcmp ult <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
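
The elim_redun_check_* tests cover a guard pattern frontends emit around sqrt: compare the input against (negative) zero and select a NaN for out-of-domain inputs. Since v_sqrt_f32 already produces a NaN for negative inputs, the guard is redundant, and the GCN-NOT: v_cndmask lines assert it is eliminated. A sketch of the guarded form; the select and the quiet-NaN constant are assumptions filled in for illustration, since the hunks cut the test bodies off:

define amdgpu_kernel void @guarded_sqrt_sketch(float addrspace(1)* %out, float %in) {
entry:
  %sqrt = call float @llvm.sqrt.f32(float %in)
  %cmp = fcmp olt float %in, -0.000000e+00
  ; assumed: select a quiet NaN when the input is negative
  %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
  store float %res, float addrspace(1)* %out
  ret void
}

declare float @llvm.sqrt.f32(float)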
diff --git a/test/CodeGen/AMDGPU/fsub.f16.ll b/test/CodeGen/AMDGPU/fsub.f16.ll
index 0b3c8ac2503d..d3c5df317771 100644
--- a/test/CodeGen/AMDGPU/fsub.f16.ll
+++ b/test/CodeGen/AMDGPU/fsub.f16.ll
@@ -1,17 +1,18 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
-; GCN-LABEL: {{^}}fsub_f16
+; GCN-LABEL: {{^}}fsub_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
+; GFX89: v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @fsub_f16(
+define amdgpu_kernel void @fsub_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -23,16 +24,15 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fsub_f16_imm_a
+; GCN-LABEL: {{^}}fsub_f16_imm_a:
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x3c00{{$}}
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
+; SI: v_sub_f32_e32 v[[R_F32:[0-9]+]], 1.0, v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_sub_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]]
+; GFX89: v_sub_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @fsub_f16_imm_a(
+define amdgpu_kernel void @fsub_f16_imm_a(
half addrspace(1)* %r,
half addrspace(1)* %b) {
entry:
@@ -42,16 +42,15 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fsub_f16_imm_b
+; GCN-LABEL: {{^}}fsub_f16_imm_b:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0xc000{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
+; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], -2.0, v[[A_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], -2.0, v[[A_F16]]
+; GFX89: v_add_f16_e32 v[[R_F16:[0-9]+]], -2.0, v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @fsub_f16_imm_b(
+define amdgpu_kernel void @fsub_f16_imm_b(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -61,27 +60,33 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fsub_v2f16
+; GCN-LABEL: {{^}}fsub_v2f16:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_subrev_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-; VI: v_subrev_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_subrev_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @fsub_v2f16(
+
+define amdgpu_kernel void @fsub_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
@@ -93,25 +98,32 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fsub_v2f16_imm_a
+; GCN-LABEL: {{^}}fsub_v2f16_imm_a:
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], 0x3c00{{$}}
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], 0x4000{{$}}
+
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_subrev_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
-; VI: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; VI-DAG: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
+; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
+; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40003c00
+; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[K]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @fsub_v2f16_imm_a(
+
+define amdgpu_kernel void @fsub_v2f16_imm_a(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %b) {
entry:
@@ -121,25 +133,32 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fsub_v2f16_imm_b
+; GCN-LABEL: {{^}}fsub_v2f16_imm_b:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], 0x4000{{$}}
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], 0x3c00{{$}}
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_subrev_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
-; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]]
+; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
+; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00c000
+; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[K]], v[[A_V2_F16]]{{$}}
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @fsub_v2f16_imm_b(
+
+define amdgpu_kernel void @fsub_v2f16_imm_b(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/fsub.ll b/test/CodeGen/AMDGPU/fsub.ll
index 3429df33c015..e7a92d95d485 100644
--- a/test/CodeGen/AMDGPU/fsub.ll
+++ b/test/CodeGen/AMDGPU/fsub.ll
@@ -4,7 +4,7 @@
; FUNC-LABEL: {{^}}v_fsub_f32:
; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-define void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
%a = load float, float addrspace(1)* %in, align 4
%b = load float, float addrspace(1)* %b_ptr, align 4
@@ -17,23 +17,19 @@ define void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, -KC0[2].W
; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-define void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) {
+define amdgpu_kernel void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) {
%sub = fsub float %a, %b
store float %sub, float addrspace(1)* %out, align 4
ret void
}
-declare float @llvm.r600.load.input(i32) readnone
-
-declare void @llvm.AMDGPU.store.output(float, i32)
-
; FUNC-LABEL: {{^}}fsub_v2f32:
; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z
; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y
; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
%sub = fsub <2 x float> %a, %b
store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8
ret void
@@ -49,7 +45,7 @@ define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x flo
; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16
%b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16
@@ -64,8 +60,75 @@ define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(
; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; SI: s_endpgm
-define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) {
+define amdgpu_kernel void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) {
%result = fsub <4 x float> %a, %b
store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
ret void
}
+
+; FUNC-LABEL: {{^}}v_fneg_fsub_f32:
+; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]]
+define amdgpu_kernel void @v_fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %a = load float, float addrspace(1)* %in, align 4
+ %b = load float, float addrspace(1)* %b_ptr, align 4
+ %result = fsub float %a, %b
+ %neg.result = fsub float -0.0, %result
+ store float %neg.result, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_f32:
+; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
+; SI-NOT: xor
+define amdgpu_kernel void @v_fneg_fsub_nsz_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %a = load float, float addrspace(1)* %in, align 4
+ %b = load float, float addrspace(1)* %b_ptr, align 4
+ %result = fsub nsz float %a, %b
+ %neg.result = fsub float -0.0, %result
+ store float %neg.result, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_attribute_f32:
+; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
+; SI-NOT: xor
+define amdgpu_kernel void @v_fneg_fsub_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %a = load float, float addrspace(1)* %in, align 4
+ %b = load float, float addrspace(1)* %b_ptr, align 4
+ %result = fsub float %a, %b
+ %neg.result = fsub float -0.0, %result
+ store float %neg.result, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; The attribute takes a string value of "true" or "false", so make
+; sure the fold is treated as disabled and the fneg is not folded
+; when the value is anything other than "true".
+; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_false_attribute_f32:
+; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]]
+define amdgpu_kernel void @v_fneg_fsub_nsz_false_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %a = load float, float addrspace(1)* %in, align 4
+ %b = load float, float addrspace(1)* %b_ptr, align 4
+ %result = fsub float %a, %b
+ %neg.result = fsub float -0.0, %result
+ store float %neg.result, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_fsub_0_nsz_attribute_f32:
+; SI-NOT: v_sub
+define amdgpu_kernel void @v_fsub_0_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %a = load float, float addrspace(1)* %in, align 4
+ %result = fsub float %a, 0.0
+ store float %result, float addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind "no-signed-zeros-fp-math"="true" }
+attributes #1 = { nounwind "no-signed-zeros-fp-math"="false" }
diff --git a/test/CodeGen/AMDGPU/fsub64.ll b/test/CodeGen/AMDGPU/fsub64.ll
index 4c9c5ddd4c6e..1b0879d098ee 100644
--- a/test/CodeGen/AMDGPU/fsub64.ll
+++ b/test/CodeGen/AMDGPU/fsub64.ll
@@ -5,7 +5,7 @@ declare double @llvm.fabs.f64(double) #0
; SI-LABEL: {{^}}fsub_f64:
; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
-define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
%r1 = load double, double addrspace(1)* %in2
@@ -16,7 +16,7 @@ define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
; SI-LABEL: {{^}}fsub_fabs_f64:
; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}}
-define void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
%r1 = load double, double addrspace(1)* %in2
@@ -28,7 +28,7 @@ define void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
; SI-LABEL: {{^}}fsub_fabs_inv_f64:
; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|, -v\[[0-9]+:[0-9]+\]}}
-define void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
%r1 = load double, double addrspace(1)* %in2
@@ -40,7 +40,7 @@ define void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %
; SI-LABEL: {{^}}s_fsub_f64:
; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\]}}
-define void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) {
+define amdgpu_kernel void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) {
%sub = fsub double %a, %b
store double %sub, double addrspace(1)* %out
ret void
@@ -48,7 +48,7 @@ define void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) {
; SI-LABEL: {{^}}s_fsub_imm_f64:
; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}}, 4.0
-define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) {
+define amdgpu_kernel void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) {
%sub = fsub double 4.0, %a
store double %sub, double addrspace(1)* %out
ret void
@@ -56,7 +56,7 @@ define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) {
; SI-LABEL: {{^}}s_fsub_imm_inv_f64:
; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\]}}, -4.0
-define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) {
+define amdgpu_kernel void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) {
%sub = fsub double %a, 4.0
store double %sub, double addrspace(1)* %out
ret void
@@ -64,7 +64,7 @@ define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b)
; SI-LABEL: {{^}}s_fsub_self_f64:
; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}}
-define void @s_fsub_self_f64(double addrspace(1)* %out, double %a) {
+define amdgpu_kernel void @s_fsub_self_f64(double addrspace(1)* %out, double %a) {
%sub = fsub double %a, %a
store double %sub, double addrspace(1)* %out
ret void
@@ -73,7 +73,7 @@ define void @s_fsub_self_f64(double addrspace(1)* %out, double %a) {
; SI-LABEL: {{^}}fsub_v2f64:
; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
-define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) {
+define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) {
%sub = fsub <2 x double> %a, %b
store <2 x double> %sub, <2 x double> addrspace(1)* %out
ret void
@@ -84,7 +84,7 @@ define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x d
; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
-define void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) {
+define amdgpu_kernel void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1
%a = load <4 x double>, <4 x double> addrspace(1)* %in
%b = load <4 x double>, <4 x double> addrspace(1)* %b_ptr
@@ -98,7 +98,7 @@ define void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(
; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
-define void @s_fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) {
+define amdgpu_kernel void @s_fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) {
%result = fsub <4 x double> %a, %b
store <4 x double> %result, <4 x double> addrspace(1)* %out, align 16
ret void
diff --git a/test/CodeGen/AMDGPU/ftrunc.f64.ll b/test/CodeGen/AMDGPU/ftrunc.f64.ll
index c4138ad79c28..1f72ec65588e 100644
--- a/test/CodeGen/AMDGPU/ftrunc.f64.ll
+++ b/test/CodeGen/AMDGPU/ftrunc.f64.ll
@@ -13,7 +13,7 @@ declare <16 x double> @llvm.trunc.v16f64(<16 x double>) nounwind readnone
; CI: v_trunc_f64
; SI: v_bfe_u32 {{v[0-9]+}}, {{v[0-9]+}}, 20, 11
; SI: s_endpgm
-define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
%x = load double, double addrspace(1)* %in, align 8
%y = call double @llvm.trunc.f64(double %x) nounwind readnone
store double %y, double addrspace(1)* %out, align 8
@@ -36,7 +36,7 @@ define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
; SI-DAG: cndmask_b32
; SI-DAG: cndmask_b32
; SI: s_endpgm
-define void @ftrunc_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @ftrunc_f64(double addrspace(1)* %out, double %x) {
%y = call double @llvm.trunc.f64(double %x) nounwind readnone
store double %y, double addrspace(1)* %out
ret void
@@ -45,7 +45,7 @@ define void @ftrunc_f64(double addrspace(1)* %out, double %x) {
; FUNC-LABEL: {{^}}ftrunc_v2f64:
; CI: v_trunc_f64_e32
; CI: v_trunc_f64_e32
-define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+define amdgpu_kernel void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
%y = call <2 x double> @llvm.trunc.v2f64(<2 x double> %x) nounwind readnone
store <2 x double> %y, <2 x double> addrspace(1)* %out
ret void
@@ -55,7 +55,7 @@ define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
; FIXME-CI: v_trunc_f64_e32
; FIXME-CI: v_trunc_f64_e32
; FIXME-CI: v_trunc_f64_e32
-; define void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+; define amdgpu_kernel void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
; %y = call <3 x double> @llvm.trunc.v3f64(<3 x double> %x) nounwind readnone
; store <3 x double> %y, <3 x double> addrspace(1)* %out
; ret void
@@ -66,7 +66,7 @@ define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
; CI: v_trunc_f64_e32
; CI: v_trunc_f64_e32
; CI: v_trunc_f64_e32
-define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+define amdgpu_kernel void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
%y = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone
store <4 x double> %y, <4 x double> addrspace(1)* %out
ret void
@@ -81,7 +81,7 @@ define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
; CI: v_trunc_f64_e32
; CI: v_trunc_f64_e32
; CI: v_trunc_f64_e32
-define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+define amdgpu_kernel void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
%y = call <8 x double> @llvm.trunc.v8f64(<8 x double> %x) nounwind readnone
store <8 x double> %y, <8 x double> addrspace(1)* %out
ret void
@@ -104,7 +104,7 @@ define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
; CI: v_trunc_f64_e32
; CI: v_trunc_f64_e32
; CI: v_trunc_f64_e32
-define void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+define amdgpu_kernel void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
%y = call <16 x double> @llvm.trunc.v16f64(<16 x double> %x) nounwind readnone
store <16 x double> %y, <16 x double> addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/ftrunc.ll b/test/CodeGen/AMDGPU/ftrunc.ll
index d0718394e7f1..b5ad01eaeaf0 100644
--- a/test/CodeGen/AMDGPU/ftrunc.ll
+++ b/test/CodeGen/AMDGPU/ftrunc.ll
@@ -12,7 +12,7 @@ declare <16 x float> @llvm.trunc.v16f32(<16 x float>) nounwind readnone
; FUNC-LABEL: {{^}}ftrunc_f32:
; EG: TRUNC
; SI: v_trunc_f32_e32
-define void @ftrunc_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @ftrunc_f32(float addrspace(1)* %out, float %x) {
%y = call float @llvm.trunc.f32(float %x) nounwind readnone
store float %y, float addrspace(1)* %out
ret void
@@ -23,7 +23,7 @@ define void @ftrunc_f32(float addrspace(1)* %out, float %x) {
; EG: TRUNC
; SI: v_trunc_f32_e32
; SI: v_trunc_f32_e32
-define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
+define amdgpu_kernel void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
%y = call <2 x float> @llvm.trunc.v2f32(<2 x float> %x) nounwind readnone
store <2 x float> %y, <2 x float> addrspace(1)* %out
ret void
@@ -36,7 +36,7 @@ define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
; FIXME-SI: v_trunc_f32_e32
; FIXME-SI: v_trunc_f32_e32
; FIXME-SI: v_trunc_f32_e32
-; define void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
+; define amdgpu_kernel void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
; %y = call <3 x float> @llvm.trunc.v3f32(<3 x float> %x) nounwind readnone
; store <3 x float> %y, <3 x float> addrspace(1)* %out
; ret void
@@ -51,7 +51,7 @@ define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
; SI: v_trunc_f32_e32
; SI: v_trunc_f32_e32
; SI: v_trunc_f32_e32
-define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
+define amdgpu_kernel void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
%y = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone
store <4 x float> %y, <4 x float> addrspace(1)* %out
ret void
@@ -74,7 +74,7 @@ define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
; SI: v_trunc_f32_e32
; SI: v_trunc_f32_e32
; SI: v_trunc_f32_e32
-define void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
+define amdgpu_kernel void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
%y = call <8 x float> @llvm.trunc.v8f32(<8 x float> %x) nounwind readnone
store <8 x float> %y, <8 x float> addrspace(1)* %out
ret void
@@ -113,7 +113,7 @@ define void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
; SI: v_trunc_f32_e32
; SI: v_trunc_f32_e32
; SI: v_trunc_f32_e32
-define void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
+define amdgpu_kernel void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
%y = call <16 x float> @llvm.trunc.v16f32(<16 x float> %x) nounwind readnone
store <16 x float> %y, <16 x float> addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/gep-address-space.ll b/test/CodeGen/AMDGPU/gep-address-space.ll
index f96463613e8e..7fb47e08ea58 100644
--- a/test/CodeGen/AMDGPU/gep-address-space.ll
+++ b/test/CodeGen/AMDGPU/gep-address-space.ll
@@ -2,7 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
-define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind {
+define amdgpu_kernel void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind {
; CHECK-LABEL: {{^}}use_gep_address_space:
; CHECK: v_mov_b32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}}
; CHECK: ds_write_b32 [[PTR]], v{{[0-9]+}} offset:64
@@ -17,7 +17,7 @@ define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind {
; SI: s_or_b32
; CI: s_add_i32
; CHECK: ds_write_b32
-define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind {
+define amdgpu_kernel void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind {
%p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16384
store i32 99, i32 addrspace(3)* %p
ret void
@@ -39,7 +39,7 @@ define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %arra
; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
; CHECK: s_endpgm
-define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind {
+define amdgpu_kernel void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind {
%p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
%p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0
%p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1
@@ -60,7 +60,7 @@ define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind
; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
; CHECK: s_endpgm
-define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind {
+define amdgpu_kernel void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind {
%p = getelementptr [1024 x i32], <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> <i16 16, i16 16>
%p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0
%p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1
diff --git a/test/CodeGen/AMDGPU/global-constant.ll b/test/CodeGen/AMDGPU/global-constant.ll
index 5a18d425d506..80acfcca7082 100644
--- a/test/CodeGen/AMDGPU/global-constant.ll
+++ b/test/CodeGen/AMDGPU/global-constant.ll
@@ -26,7 +26,7 @@
; HSA: s_add_u32 s{{[0-9]+}}, s[[PC1_LO]], private2@rel32@lo+4
; HSA: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], private2@rel32@hi+4
-define void @private_test(i32 %index, float addrspace(1)* %out) {
+define amdgpu_kernel void @private_test(i32 %index, float addrspace(1)* %out) {
%ptr = getelementptr [4 x float], [4 x float] addrspace(2)* @private1, i32 0, i32 %index
%val = load float, float addrspace(2)* %ptr
store float %val, float addrspace(1)* %out
@@ -40,7 +40,7 @@ define void @private_test(i32 %index, float addrspace(1)* %out) {
; HSA: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}}
; HSA: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], available_externally@gotpcrel32@lo+4
; HSA: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], available_externally@gotpcrel32@hi+4
-define void @available_externally_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(2)* @available_externally, i32 0, i32 1
%val = load i32, i32 addrspace(2)* %ptr
store i32 %val, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/global-directive.ll b/test/CodeGen/AMDGPU/global-directive.ll
index 450b7d367429..ce89e390eac1 100644
--- a/test/CodeGen/AMDGPU/global-directive.ll
+++ b/test/CodeGen/AMDGPU/global-directive.ll
@@ -5,7 +5,7 @@
; SI: .globl foo
; SI: {{^}}foo:
-define void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load i32, i32 addrspace(1)* %in
%b = load i32, i32 addrspace(1)* %b_ptr
diff --git a/test/CodeGen/AMDGPU/global-extload-i16.ll b/test/CodeGen/AMDGPU/global-extload-i16.ll
index 2c7c02de1673..19e592f50bea 100644
--- a/test/CodeGen/AMDGPU/global-extload-i16.ll
+++ b/test/CodeGen/AMDGPU/global-extload-i16.ll
@@ -7,7 +7,7 @@
; SI: buffer_load_ushort
; SI: buffer_store_dword
; SI: s_endpgm
-define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
%a = load i16, i16 addrspace(1)* %in
%ext = zext i16 %a to i32
store i32 %ext, i32 addrspace(1)* %out
@@ -18,7 +18,7 @@ define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
; SI: buffer_load_sshort
; SI: buffer_store_dword
; SI: s_endpgm
-define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
%a = load i16, i16 addrspace(1)* %in
%ext = sext i16 %a to i32
store i32 %ext, i32 addrspace(1)* %out
@@ -28,7 +28,7 @@ define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i32:
; SI: buffer_load_ushort
; SI: s_endpgm
-define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <1 x i16>, <1 x i16> addrspace(1)* %in
%ext = zext <1 x i16> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -38,7 +38,7 @@ define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i
; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i32:
; SI: buffer_load_sshort
; SI: s_endpgm
-define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <1 x i16>, <1 x i16> addrspace(1)* %in
%ext = sext <1 x i16> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i
; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32:
; SI: s_endpgm
-define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <2 x i16>, <2 x i16> addrspace(1)* %in
%ext = zext <2 x i16> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -56,7 +56,7 @@ define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i
; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32:
; SI: s_endpgm
-define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <2 x i16>, <2 x i16> addrspace(1)* %in
%ext = sext <2 x i16> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -65,7 +65,7 @@ define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i
; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32:
; SI: s_endpgm
-define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <4 x i16>, <4 x i16> addrspace(1)* %in
%ext = zext <4 x i16> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -74,7 +74,7 @@ define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i
; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32:
; SI: s_endpgm
-define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <4 x i16>, <4 x i16> addrspace(1)* %in
%ext = sext <4 x i16> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -83,7 +83,7 @@ define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i
; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32:
; SI: s_endpgm
-define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <8 x i16>, <8 x i16> addrspace(1)* %in
%ext = zext <8 x i16> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -92,7 +92,7 @@ define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i
; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32:
; SI: s_endpgm
-define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <8 x i16>, <8 x i16> addrspace(1)* %in
%ext = sext <8 x i16> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -101,7 +101,7 @@ define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i
; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32:
; SI: s_endpgm
-define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <16 x i16>, <16 x i16> addrspace(1)* %in
%ext = zext <16 x i16> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -110,7 +110,7 @@ define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16
; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32:
; SI: s_endpgm
-define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <16 x i16>, <16 x i16> addrspace(1)* %in
%ext = sext <16 x i16> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -119,7 +119,7 @@ define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16
; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32:
; SI: s_endpgm
-define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <32 x i16>, <32 x i16> addrspace(1)* %in
%ext = zext <32 x i16> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -128,7 +128,7 @@ define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32
; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32:
; SI: s_endpgm
-define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <32 x i16>, <32 x i16> addrspace(1)* %in
%ext = sext <32 x i16> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -137,7 +137,7 @@ define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32
; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32:
; SI: s_endpgm
-define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <64 x i16>, <64 x i16> addrspace(1)* %in
%ext = zext <64 x i16> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -146,7 +146,7 @@ define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64
; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32:
; SI: s_endpgm
-define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <64 x i16>, <64 x i16> addrspace(1)* %in
%ext = sext <64 x i16> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -157,7 +157,7 @@ define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64
; SI-DAG: buffer_load_ushort v[[LO:[0-9]+]],
; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
-define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
%a = load i16, i16 addrspace(1)* %in
%ext = zext i16 %a to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -168,7 +168,7 @@ define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
; VI: buffer_load_ushort [[LOAD:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0
; VI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
; VI: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0
-define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
%a = load i16, i16 addrspace(1)* %in
%ext = sext i16 %a to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -177,7 +177,7 @@ define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64:
; SI: s_endpgm
-define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <1 x i16>, <1 x i16> addrspace(1)* %in
%ext = zext <1 x i16> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -186,7 +186,7 @@ define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i
; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64:
; SI: s_endpgm
-define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <1 x i16>, <1 x i16> addrspace(1)* %in
%ext = sext <1 x i16> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -195,7 +195,7 @@ define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i
; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64:
; SI: s_endpgm
-define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <2 x i16>, <2 x i16> addrspace(1)* %in
%ext = zext <2 x i16> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -204,7 +204,7 @@ define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64:
; SI: s_endpgm
-define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <2 x i16>, <2 x i16> addrspace(1)* %in
%ext = sext <2 x i16> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -213,7 +213,7 @@ define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i64:
; SI: s_endpgm
-define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <4 x i16>, <4 x i16> addrspace(1)* %in
%ext = zext <4 x i16> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -222,7 +222,7 @@ define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64:
; SI: s_endpgm
-define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <4 x i16>, <4 x i16> addrspace(1)* %in
%ext = sext <4 x i16> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -231,7 +231,7 @@ define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64:
; SI: s_endpgm
-define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <8 x i16>, <8 x i16> addrspace(1)* %in
%ext = zext <8 x i16> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -240,7 +240,7 @@ define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64:
; SI: s_endpgm
-define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <8 x i16>, <8 x i16> addrspace(1)* %in
%ext = sext <8 x i16> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -249,7 +249,7 @@ define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64:
; SI: s_endpgm
-define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <16 x i16>, <16 x i16> addrspace(1)* %in
%ext = zext <16 x i16> %load to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -258,7 +258,7 @@ define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16
; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64:
; SI: s_endpgm
-define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <16 x i16>, <16 x i16> addrspace(1)* %in
%ext = sext <16 x i16> %load to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -267,7 +267,7 @@ define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16
; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64:
; SI: s_endpgm
-define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <32 x i16>, <32 x i16> addrspace(1)* %in
%ext = zext <32 x i16> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -276,7 +276,7 @@ define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32
; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64:
; SI: s_endpgm
-define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <32 x i16>, <32 x i16> addrspace(1)* %in
%ext = sext <32 x i16> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -285,7 +285,7 @@ define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32
; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64:
; SI: s_endpgm
-define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <64 x i16>, <64 x i16> addrspace(1)* %in
%ext = zext <64 x i16> %load to <64 x i64>
store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -294,7 +294,7 @@ define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64
; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64:
; SI: s_endpgm
-define void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
%load = load <64 x i16>, <64 x i16> addrspace(1)* %in
%ext = sext <64 x i16> %load to <64 x i64>
store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/global-variable-relocs.ll b/test/CodeGen/AMDGPU/global-variable-relocs.ll
index 00be6e4d5c15..ae6dd54fec6c 100644
--- a/test/CodeGen/AMDGPU/global-variable-relocs.ll
+++ b/test/CodeGen/AMDGPU/global-variable-relocs.ll
@@ -19,7 +19,7 @@
; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]]
; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]]
; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @private_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @private_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @private, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
store i32 %val, i32 addrspace(1)* %out
@@ -33,7 +33,7 @@ define void @private_test(i32 addrspace(1)* %out) {
; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]]
; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]]
; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @internal_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @internal_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @internal, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
store i32 %val, i32 addrspace(1)* %out
@@ -50,7 +50,7 @@ define void @internal_test(i32 addrspace(1)* %out) {
; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @available_externally_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @available_externally, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
store i32 %val, i32 addrspace(1)* %out
@@ -67,7 +67,7 @@ define void @available_externally_test(i32 addrspace(1)* %out) {
; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @linkonce_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @linkonce_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
store i32 %val, i32 addrspace(1)* %out
@@ -84,7 +84,7 @@ define void @linkonce_test(i32 addrspace(1)* %out) {
; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @weak_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @weak_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
store i32 %val, i32 addrspace(1)* %out
@@ -101,7 +101,7 @@ define void @weak_test(i32 addrspace(1)* %out) {
; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @common_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @common_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @common, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
store i32 %val, i32 addrspace(1)* %out
@@ -118,7 +118,7 @@ define void @common_test(i32 addrspace(1)* %out) {
; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @extern_weak_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @extern_weak_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @extern_weak, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
store i32 %val, i32 addrspace(1)* %out
@@ -135,7 +135,7 @@ define void @extern_weak_test(i32 addrspace(1)* %out) {
; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @linkonce_odr_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @linkonce_odr_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce_odr, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
store i32 %val, i32 addrspace(1)* %out
@@ -152,7 +152,7 @@ define void @linkonce_odr_test(i32 addrspace(1)* %out) {
; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @weak_odr_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @weak_odr_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak_odr, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
store i32 %val, i32 addrspace(1)* %out
@@ -169,7 +169,7 @@ define void @weak_odr_test(i32 addrspace(1)* %out) {
; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @external_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @external_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
store i32 %val, i32 addrspace(1)* %out
@@ -186,7 +186,7 @@ define void @external_test(i32 addrspace(1)* %out) {
; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @external_w_init_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @external_w_init_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external_w_init, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
store i32 %val, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/global_atomics.ll b/test/CodeGen/AMDGPU/global_atomics.ll
index 909ceb5546c6..6928bede547e 100644
--- a/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/test/CodeGen/AMDGPU/global_atomics.ll
@@ -3,7 +3,7 @@
; FUNC-LABEL: {{^}}atomic_add_i32_offset:
; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -13,7 +13,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_add_i32_soffset:
; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x8ca0
; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}}
-define void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 9000
%val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -25,7 +25,7 @@ entry:
; SI-DAG: v_mov_b32_e32 v[[PTRHI:[0-9]+]], 0xabcd
; SI: buffer_atomic_add v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_add
-define void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 47224239175595
@@ -36,7 +36,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_add_i32_ret_offset:
; GCN: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -47,7 +47,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset:
; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -59,7 +59,7 @@ entry:
; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -70,7 +70,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_add_i32:
; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
ret void
@@ -79,7 +79,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_add_i32_ret:
; GCN: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
-define void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
store i32 %val, i32 addrspace(1)* %out2
@@ -89,7 +89,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_add_i32_addr64:
; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -100,7 +100,7 @@ entry:
; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -110,7 +110,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_and_i32_offset:
; GCN: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -120,7 +120,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_and_i32_ret_offset:
; GCN: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -131,7 +131,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset:
; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -143,7 +143,7 @@ entry:
; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
; VI: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -154,7 +154,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_and_i32:
; GCN: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
ret void
@@ -163,7 +163,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_and_i32_ret:
; GCN: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
-define void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
store i32 %val, i32 addrspace(1)* %out2
@@ -173,7 +173,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_and_i32_addr64:
; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -184,7 +184,7 @@ entry:
; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -194,7 +194,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_sub_i32_offset:
; GCN: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -204,7 +204,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_sub_i32_ret_offset:
; GCN: buffer_atomic_sub [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -215,7 +215,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset:
; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -227,7 +227,7 @@ entry:
; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -238,7 +238,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_sub_i32:
; GCN: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
ret void
@@ -247,7 +247,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_sub_i32_ret:
; GCN: buffer_atomic_sub [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
-define void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
store i32 %val, i32 addrspace(1)* %out2
@@ -257,7 +257,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_sub_i32_addr64:
; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -268,7 +268,7 @@ entry:
; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -278,7 +278,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_max_i32_offset:
; GCN: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -288,7 +288,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_max_i32_ret_offset:
; GCN: buffer_atomic_smax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -299,7 +299,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset:
; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -311,7 +311,7 @@ entry:
; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -322,7 +322,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_max_i32:
; GCN: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
ret void
@@ -331,7 +331,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_max_i32_ret:
; GCN: buffer_atomic_smax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
-define void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
store i32 %val, i32 addrspace(1)* %out2
@@ -341,7 +341,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_max_i32_addr64:
; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -352,7 +352,7 @@ entry:
; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -362,7 +362,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_umax_i32_offset:
; GCN: buffer_atomic_umax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -372,7 +372,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_umax_i32_ret_offset:
; GCN: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -383,7 +383,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset:
; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -395,7 +395,7 @@ entry:
; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -406,7 +406,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_umax_i32:
; GCN: buffer_atomic_umax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
ret void
@@ -415,7 +415,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_umax_i32_ret:
; GCN: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
-define void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
store i32 %val, i32 addrspace(1)* %out2
@@ -425,7 +425,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_umax_i32_addr64:
; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -436,7 +436,7 @@ entry:
; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -446,7 +446,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_min_i32_offset:
; GCN: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -456,7 +456,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_min_i32_ret_offset:
; GCN: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -467,7 +467,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset:
; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -479,7 +479,7 @@ entry:
; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -490,7 +490,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_min_i32:
; GCN: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
ret void
@@ -499,7 +499,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_min_i32_ret:
; GCN: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
-define void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
store i32 %val, i32 addrspace(1)* %out2
@@ -509,7 +509,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_min_i32_addr64:
; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -520,7 +520,7 @@ entry:
; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -530,7 +530,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_umin_i32_offset:
; GCN: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -540,7 +540,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_umin_i32_ret_offset:
; GCN: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -551,7 +551,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset:
; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -563,7 +563,7 @@ entry:
; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -574,7 +574,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_umin_i32:
; GCN: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
ret void
@@ -583,7 +583,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_umin_i32_ret:
; GCN: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
-define void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
store i32 %val, i32 addrspace(1)* %out2
@@ -593,7 +593,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_umin_i32_addr64:
; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -604,7 +604,7 @@ entry:
; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -614,7 +614,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_or_i32_offset:
; GCN: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -624,7 +624,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_or_i32_ret_offset:
; GCN: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -635,7 +635,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset:
; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -647,7 +647,7 @@ entry:
; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -658,7 +658,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_or_i32:
; GCN: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
ret void
@@ -667,7 +667,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_or_i32_ret:
; GCN: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
-define void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
store i32 %val, i32 addrspace(1)* %out2
@@ -677,7 +677,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_or_i32_addr64:
; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -688,7 +688,7 @@ entry:
; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -698,7 +698,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_xchg_i32_offset:
; GCN: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -708,7 +708,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_offset:
; GCN: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -720,7 +720,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64_offset:
; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -733,7 +733,7 @@ entry:
; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -744,7 +744,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_xchg_i32:
; GCN: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
ret void
@@ -753,7 +753,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_xchg_i32_ret:
; GCN: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
-define void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
store i32 %val, i32 addrspace(1)* %out2
@@ -763,7 +763,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64:
; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -774,7 +774,7 @@ entry:
; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -784,7 +784,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_offset:
; GCN: buffer_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -794,7 +794,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset:
; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword v[[RET]]
-define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -807,7 +807,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset:
; SI: buffer_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
; VI: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -819,7 +819,7 @@ entry:
; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GCN: buffer_store_dword v[[RET]]
-define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -831,7 +831,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32:
; GCN: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
ret void
@@ -840,7 +840,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret:
; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword v[[RET]]
-define void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
entry:
%val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
%extract0 = extractvalue { i32, i1 } %val, 0
@@ -851,7 +851,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64:
; SI: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
@@ -862,7 +862,7 @@ entry:
; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GCN: buffer_store_dword v[[RET]]
-define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
@@ -873,7 +873,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_xor_i32_offset:
; GCN: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -883,7 +883,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_xor_i32_ret_offset:
; GCN: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
%val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -894,7 +894,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset:
; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -906,7 +906,7 @@ entry:
; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -917,7 +917,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_xor_i32:
; GCN: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
ret void
@@ -926,7 +926,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_xor_i32_ret:
; GCN: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
-define void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
store i32 %val, i32 addrspace(1)* %out2
@@ -936,7 +936,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_xor_i32_addr64:
; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -947,7 +947,7 @@ entry:
; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -959,7 +959,7 @@ entry:
; SI: buffer_load_dword [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_load_i32_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i32_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %in, i64 4
%val = load atomic i32, i32 addrspace(1)* %gep seq_cst, align 4
@@ -971,7 +971,7 @@ entry:
; SI: buffer_load_dword [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc
; GCN: buffer_store_dword [[RET]]
-define void @atomic_load_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in seq_cst, align 4
store i32 %val, i32 addrspace(1)* %out
@@ -982,7 +982,7 @@ entry:
; SI: buffer_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_load_i32_addr64_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -995,7 +995,7 @@ entry:
; SI: buffer_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GCN: buffer_store_dword [[RET]]
-define void @atomic_load_i32_addr64(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i32_addr64(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index
%val = load atomic i32, i32 addrspace(1)* %ptr seq_cst, align 4
@@ -1006,7 +1006,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_store_i32_offset:
; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
store atomic i32 %in, i32 addrspace(1)* %gep seq_cst, align 4
@@ -1016,7 +1016,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_store_i32:
; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc{{$}}
; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4
ret void
@@ -1025,7 +1025,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_store_i32_addr64_offset:
; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -1036,7 +1036,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_store_i32_addr64:
; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
store atomic i32 %in, i32 addrspace(1)* %ptr seq_cst, align 4
diff --git a/test/CodeGen/AMDGPU/global_atomics_i64.ll b/test/CodeGen/AMDGPU/global_atomics_i64.ll
index f66c6c7b531a..56520b787ead 100644
--- a/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -3,7 +3,7 @@
; GCN-LABEL: {{^}}atomic_add_i64_offset:
; GCN: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_add_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(1)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -13,7 +13,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i64_ret_offset:
; GCN: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_add_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -24,7 +24,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i64_addr64_offset:
; CI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
; VI: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
-define void @atomic_add_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -36,7 +36,7 @@ entry:
; CI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
; VI: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_add_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -47,7 +47,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i64:
; GCN: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_add_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64(i64 addrspace(1)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst
ret void
@@ -56,7 +56,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i64_ret:
; GCN: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_add_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst
store i64 %tmp0, i64 addrspace(1)* %out2
@@ -66,7 +66,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i64_addr64:
; CI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_add_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%tmp0 = atomicrmw volatile add i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -77,7 +77,7 @@ entry:
; CI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_add_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%tmp0 = atomicrmw volatile add i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -87,7 +87,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i64_offset:
; GCN: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_and_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_offset(i64 addrspace(1)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -97,7 +97,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i64_ret_offset:
; GCN: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_and_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -108,7 +108,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i64_addr64_offset:
; CI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
; VI: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -120,7 +120,7 @@ entry:
; CI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
; VI: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_and_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -131,7 +131,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i64:
; GCN: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_and_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64(i64 addrspace(1)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst
ret void
@@ -140,7 +140,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i64_ret:
; GCN: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_and_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst
store i64 %tmp0, i64 addrspace(1)* %out2
@@ -150,7 +150,7 @@ entry:
; GCN-LABEL: {{^}}atomic_and_i64_addr64:
; CI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%tmp0 = atomicrmw volatile and i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -161,7 +161,7 @@ entry:
; CI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_and_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%tmp0 = atomicrmw volatile and i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -171,7 +171,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i64_offset:
; GCN: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_sub_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_offset(i64 addrspace(1)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -181,7 +181,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i64_ret_offset:
; GCN: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_sub_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -192,7 +192,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i64_addr64_offset:
; CI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
; VI: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -204,7 +204,7 @@ entry:
; CI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
; VI: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -215,7 +215,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i64:
; GCN: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_sub_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64(i64 addrspace(1)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst
ret void
@@ -224,7 +224,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i64_ret:
; GCN: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_sub_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst
store i64 %tmp0, i64 addrspace(1)* %out2
@@ -234,7 +234,7 @@ entry:
; GCN-LABEL: {{^}}atomic_sub_i64_addr64:
; CI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%tmp0 = atomicrmw volatile sub i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -245,7 +245,7 @@ entry:
; CI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_sub_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%tmp0 = atomicrmw volatile sub i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -255,7 +255,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i64_offset:
; GCN: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_max_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_offset(i64 addrspace(1)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -265,7 +265,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i64_ret_offset:
; GCN: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_max_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -276,7 +276,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i64_addr64_offset:
; CI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
; VI: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -288,7 +288,7 @@ entry:
; CI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
; VI: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_max_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -299,7 +299,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i64:
; GCN: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_max_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64(i64 addrspace(1)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst
ret void
@@ -308,7 +308,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i64_ret:
; GCN: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_max_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst
store i64 %tmp0, i64 addrspace(1)* %out2
@@ -318,7 +318,7 @@ entry:
; GCN-LABEL: {{^}}atomic_max_i64_addr64:
; CI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%tmp0 = atomicrmw volatile max i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -329,7 +329,7 @@ entry:
; CI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_max_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%tmp0 = atomicrmw volatile max i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -339,7 +339,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i64_offset:
; GCN: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_umax_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(1)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -349,7 +349,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i64_ret_offset:
; GCN: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umax_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -360,7 +360,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i64_addr64_offset:
; CI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
; VI: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -372,7 +372,7 @@ entry:
; CI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
; VI: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -383,7 +383,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i64:
; GCN: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_umax_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64(i64 addrspace(1)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst
ret void
@@ -392,7 +392,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i64_ret:
; GCN: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umax_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst
store i64 %tmp0, i64 addrspace(1)* %out2
@@ -402,7 +402,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umax_i64_addr64:
; CI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%tmp0 = atomicrmw volatile umax i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -413,7 +413,7 @@ entry:
; CI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umax_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%tmp0 = atomicrmw volatile umax i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -423,7 +423,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i64_offset:
; GCN: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_min_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_offset(i64 addrspace(1)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -433,7 +433,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i64_ret_offset:
; GCN: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_min_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -444,7 +444,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i64_addr64_offset:
; CI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
; VI: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -456,7 +456,7 @@ entry:
; CI: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
; VI: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_min_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -467,7 +467,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i64:
; GCN: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_min_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64(i64 addrspace(1)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst
ret void
@@ -476,7 +476,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i64_ret:
; GCN: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_min_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst
store i64 %tmp0, i64 addrspace(1)* %out2
@@ -486,7 +486,7 @@ entry:
; GCN-LABEL: {{^}}atomic_min_i64_addr64:
; CI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%tmp0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -497,7 +497,7 @@ entry:
; CI: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_min_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%tmp0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -507,7 +507,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i64_offset:
; GCN: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_umin_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(1)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -517,7 +517,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i64_ret_offset:
; GCN: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umin_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -528,7 +528,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i64_addr64_offset:
; CI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
; VI: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -540,7 +540,7 @@ entry:
; CI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
; VI: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -551,7 +551,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i64:
; GCN: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_umin_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64(i64 addrspace(1)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst
ret void
@@ -560,7 +560,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i64_ret:
; GCN: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umin_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst
store i64 %tmp0, i64 addrspace(1)* %out2
@@ -570,7 +570,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i64_addr64:
; CI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%tmp0 = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -581,7 +581,7 @@ entry:
; CI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umin_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%tmp0 = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -591,7 +591,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i64_offset:
; GCN: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_or_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_offset(i64 addrspace(1)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -601,7 +601,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i64_ret_offset:
; GCN: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_or_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -612,7 +612,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i64_addr64_offset:
; CI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
; VI: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -624,7 +624,7 @@ entry:
; CI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
; VI: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_or_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -635,7 +635,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i64:
; GCN: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_or_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64(i64 addrspace(1)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst
ret void
@@ -644,7 +644,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i64_ret:
; GCN: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_or_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst
store i64 %tmp0, i64 addrspace(1)* %out2
@@ -654,7 +654,7 @@ entry:
; GCN-LABEL: {{^}}atomic_or_i64_addr64:
; CI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%tmp0 = atomicrmw volatile or i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -665,7 +665,7 @@ entry:
; CI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_or_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%tmp0 = atomicrmw volatile or i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -675,7 +675,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i64_offset:
; GCN: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_xchg_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_offset(i64 addrspace(1)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -685,7 +685,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset:
; GCN: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -696,7 +696,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i64_addr64_offset:
; CI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
; VI: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
-define void @atomic_xchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -708,7 +708,7 @@ entry:
; CI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
; VI: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -719,7 +719,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i64:
; GCN: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_xchg_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64(i64 addrspace(1)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst
ret void
@@ -728,7 +728,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i64_ret:
; GCN: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst
store i64 %tmp0, i64 addrspace(1)* %out2
@@ -738,7 +738,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xchg_i64_addr64:
; CI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -749,7 +749,7 @@ entry:
; CI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -759,7 +759,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i64_offset:
; GCN: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_xor_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_offset(i64 addrspace(1)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -769,7 +769,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i64_ret_offset:
; GCN: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xor_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -780,7 +780,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i64_addr64_offset:
; CI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
; VI: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -792,7 +792,7 @@ entry:
; CI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
; VI: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -803,7 +803,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i64:
; GCN: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_xor_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64(i64 addrspace(1)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst
ret void
@@ -812,7 +812,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i64_ret:
; GCN: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xor_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst
store i64 %tmp0, i64 addrspace(1)* %out2
@@ -822,7 +822,7 @@ entry:
; GCN-LABEL: {{^}}atomic_xor_i64_addr64:
; CI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%tmp0 = atomicrmw volatile xor i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -833,7 +833,7 @@ entry:
; CI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xor_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%tmp0 = atomicrmw volatile xor i64 addrspace(1)* %ptr, i64 %in seq_cst
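The remaining hunks in this file cover the cmpxchg and atomic load/store tests. Unlike atomicrmw, cmpxchg yields an aggregate { i64, i1 } (the loaded value plus a success flag), which is why the _ret variants below extract field 0 before storing it. A minimal sketch (names hypothetical):

  %val = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst
  %loaded = extractvalue { i64, i1 } %val, 0   ; previous memory contents
  %success = extractvalue { i64, i1 } %val, 1  ; i1: did the swap happen?
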
@@ -851,7 +851,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_offset:
; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_cmpxchg_i64_offset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
@@ -861,7 +861,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_soffset:
; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x11940
; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}}
-define void @atomic_cmpxchg_i64_soffset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 9000
%val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
@@ -871,7 +871,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_ret_offset:
; GCN: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
@@ -884,7 +884,7 @@ entry:
; CI: buffer_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
; VI: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -896,7 +896,7 @@ entry:
; CI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
; VI: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -908,7 +908,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_cmpxchg_i64:
; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_cmpxchg_i64(i64 addrspace(1)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64(i64 addrspace(1)* %out, i64 %in, i64 %old) {
entry:
%val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst
ret void
@@ -917,7 +917,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_ret:
; GCN: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
entry:
%val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst
%extract0 = extractvalue { i64, i1 } %val, 0
@@ -928,7 +928,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_addr64:
; CI: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
; VI: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%val = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst
@@ -939,7 +939,7 @@ entry:
; CI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%val = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst
@@ -952,7 +952,7 @@ entry:
; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_load_i64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %in, i64 4
%val = load atomic i64, i64 addrspace(1)* %gep seq_cst, align 8
@@ -964,7 +964,7 @@ entry:
; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_load_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
entry:
%val = load atomic i64, i64 addrspace(1)* %in seq_cst, align 8
store i64 %val, i64 addrspace(1)* %out
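In the atomic load/store checks, the glc (globally coherent) bit is what distinguishes an atomic access from a plain one: it forces the buffer/flat access to bypass the non-coherent L1 cache, so seq_cst loads observe, and stores publish, up-to-date memory. Loosely (registers illustrative):

;   buffer_load_dwordx2 v[0:1], off, s[0:3], 0 glc   ; atomic (coherent) load
;   buffer_load_dwordx2 v[0:1], off, s[0:3], 0       ; ordinary load, may hit L1
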
@@ -975,7 +975,7 @@ entry:
; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_load_i64_addr64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i64_addr64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -988,7 +988,7 @@ entry:
; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_load_i64_addr64(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i64_addr64(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index
%val = load atomic i64, i64 addrspace(1)* %ptr seq_cst, align 8
@@ -999,7 +999,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_store_i64_offset:
; CI: buffer_store_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; VI: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
-define void @atomic_store_i64_offset(i64 %in, i64 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64 addrspace(1)* %out) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
store atomic i64 %in, i64 addrspace(1)* %gep seq_cst, align 8
@@ -1009,7 +1009,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_store_i64:
; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}] glc
-define void @atomic_store_i64(i64 %in, i64 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i64(i64 %in, i64 addrspace(1)* %out) {
entry:
store atomic i64 %in, i64 addrspace(1)* %out seq_cst, align 8
ret void
@@ -1018,7 +1018,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_store_i64_addr64_offset:
; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
-define void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(1)* %out, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -1029,7 +1029,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_store_i64_addr64:
; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
-define void @atomic_store_i64_addr64(i64 %in, i64 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64 addrspace(1)* %out, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
store atomic i64 %in, i64 addrspace(1)* %ptr seq_cst, align 8
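Every hunk in this file makes the same mechanical change: the test entry points switch from the default calling convention to amdgpu_kernel, so the backend lowers them as compute kernel entry points (arguments fetched from the kernarg segment, s_endpgm epilogue) rather than as ordinary callable functions. A minimal before/after sketch of the pattern, using a hypothetical @example kernel rather than one of the tests above:

; Before: default calling convention.
define void @example(i64 addrspace(1)* %out, i64 %in) {
entry:
  %tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst
  ret void
}

; After: explicitly a kernel entry point.
define amdgpu_kernel void @example(i64 addrspace(1)* %out, i64 %in) {
entry:
  %tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst
  ret void
}
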
diff --git a/test/CodeGen/AMDGPU/gv-const-addrspace.ll b/test/CodeGen/AMDGPU/gv-const-addrspace.ll
index d07843e9dd27..0903542bac4f 100644
--- a/test/CodeGen/AMDGPU/gv-const-addrspace.ll
+++ b/test/CodeGen/AMDGPU/gv-const-addrspace.ll
@@ -15,7 +15,7 @@
; EG: @float_gv
; EG-NOT: MOVA_INT
; EG-NOT: MOV
-define void @float(float addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) {
entry:
%0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
%1 = load float, float addrspace(2)* %0
@@ -33,7 +33,7 @@ entry:
; EG: @i32_gv
; EG-NOT: MOVA_INT
; EG-NOT: MOV
-define void @i32(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @i32(i32 addrspace(1)* %out, i32 %index) {
entry:
%0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(2)* @i32_gv, i32 0, i32 %index
%1 = load i32, i32 addrspace(2)* %0
@@ -53,7 +53,7 @@ entry:
; EG: @struct_foo_gv
; EG-NOT: MOVA_INT
; EG-NOT: MOV
-define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
%gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index
%load = load i32, i32 addrspace(2)* %gep, align 4
store i32 %load, i32 addrspace(1)* %out, align 4
@@ -72,7 +72,7 @@ define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
; EG: @array_v1_gv
; EG-NOT: MOVA_INT
; EG-NOT: MOV
-define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
%gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index
%load = load <1 x i32>, <1 x i32> addrspace(2)* %gep, align 4
store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4
@@ -84,7 +84,7 @@ define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
; EG: VTX_READ_32
; EG: @float_gv
; EG-NOT: MOVA_INT
-define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) {
+define amdgpu_kernel void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) {
entry:
%0 = icmp eq i32 0, %a
br i1 %0, label %if, label %else
diff --git a/test/CodeGen/AMDGPU/gv-offset-folding.ll b/test/CodeGen/AMDGPU/gv-offset-folding.ll
index af5ee8e66750..e641d7266a79 100644
--- a/test/CodeGen/AMDGPU/gv-offset-folding.ll
+++ b/test/CodeGen/AMDGPU/gv-offset-folding.ll
@@ -12,8 +12,8 @@
; for local memory globals.
; CHECK-LABEL: lds_no_offset:
-; CHECK ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:4
-define void @lds_no_offset() {
+; CHECK: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:4
+define amdgpu_kernel void @lds_no_offset() {
entry:
%ptr = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds, i32 0, i32 1
store i32 0, i32 addrspace(3)* %ptr
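This hunk fixes more than the calling convention: the old "; CHECK ds_write_b32" line was missing the colon after the prefix, so FileCheck parsed it as an ordinary comment and the check silently never ran. Only the "PREFIX:" form (and variants such as PREFIX-LABEL: and PREFIX-NOT:) is treated as a directive:

; CHECK ds_write_b32 v0, v1 offset:4                      <- no colon: inert comment, never matched
; CHECK: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:4   <- active FileCheck directive
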
diff --git a/test/CodeGen/AMDGPU/half.ll b/test/CodeGen/AMDGPU/half.ll
index aa22e83fade2..41ae5a4a0b00 100644
--- a/test/CodeGen/AMDGPU/half.ll
+++ b/test/CodeGen/AMDGPU/half.ll
@@ -8,7 +8,7 @@
; SI: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]]
; VI: v_trunc_f16_e32 [[CVT:v[0-9]+]], [[ARG]]
; GCN: buffer_store_short [[CVT]]
-define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
+define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
store half %arg, half addrspace(1)* %out
ret void
}
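The SI/VI split in the checks above reflects how f16 kernel arguments are legalized: SI has no 16-bit register class, so the half argument arrives promoted to f32 and is converted back with v_cvt_f16_f32, whereas VI operates on the 16-bit value directly. Roughly (registers illustrative):

;   SI:  v_cvt_f16_f32_e32 v0, s0     ; arg was promoted to f32; convert back to f16
;   VI:  v_trunc_f16_e32   v0, s0     ; native f16 op on the 16-bit argument
;   GCN: buffer_store_short v0, ...   ; both store the 16-bit result
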
@@ -20,7 +20,7 @@ define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[HI]], [[V0]]
; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: s_endpgm
-define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
+define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
store <2 x half> %arg, <2 x half> addrspace(1)* %out
ret void
}
@@ -34,7 +34,7 @@ define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
; GCN-DAG: buffer_store_short
; GCN-NOT: buffer_store
; GCN: s_endpgm
-define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
+define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
store <3 x half> %arg, <3 x half> addrspace(1)* %out
ret void
}
@@ -46,33 +46,33 @@ define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
; GCN: buffer_load_ushort
; GCN: buffer_store_dwordx2
; GCN: s_endpgm
-define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
+define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
store <4 x half> %arg, <4 x half> addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}load_v8f16_arg:
-define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 {
+define amdgpu_kernel void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 {
store <8 x half> %arg, <8 x half> addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}extload_v2f16_arg:
-define void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 {
+define amdgpu_kernel void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 {
%fpext = fpext <2 x half> %in to <2 x float>
store <2 x float> %fpext, <2 x float> addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}extload_f16_to_f32_arg:
-define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 {
+define amdgpu_kernel void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 {
%ext = fpext half %arg to float
store float %ext, float addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg:
-define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 {
%ext = fpext <2 x half> %arg to <2 x float>
store <2 x float> %ext, <2 x float> addrspace(1)* %out
ret void
@@ -90,14 +90,14 @@ define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x hal
; GCN-DAG: buffer_store_dword
; GCN-DAG: buffer_store_dwordx2
; GCN: s_endpgm
-define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
%ext = fpext <3 x half> %arg to <3 x float>
store <3 x float> %ext, <3 x float> addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg:
-define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 {
%ext = fpext <4 x half> %arg to <4 x float>
store <4 x float> %ext, <4 x float> addrspace(1)* %out
ret void
@@ -124,7 +124,7 @@ define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x hal
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
-define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
%ext = fpext <8 x half> %arg to <8 x float>
store <8 x float> %ext, <8 x float> addrspace(1)* %out
ret void
@@ -138,7 +138,7 @@ define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x hal
; VI: v_cvt_f32_f16_e32 v[[VARG_F32:[0-9]+]], v[[VARG]]
; VI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[VARG_F32]]
; GCN: buffer_store_dwordx2 [[RESULT]]
-define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
+define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
%ext = fpext half %arg to double
store double %ext, double addrspace(1)* %out
ret void
@@ -152,7 +152,7 @@ define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
-define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
%ext = fpext <2 x half> %arg to <2 x double>
store <2 x double> %ext, <2 x double> addrspace(1)* %out
ret void
@@ -169,7 +169,7 @@ define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x ha
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
-define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
%ext = fpext <3 x half> %arg to <3 x double>
store <3 x double> %ext, <3 x double> addrspace(1)* %out
ret void
@@ -189,7 +189,7 @@ define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x ha
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
-define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
%ext = fpext <4 x half> %arg to <4 x double>
store <4 x double> %ext, <4 x double> addrspace(1)* %out
ret void
@@ -227,7 +227,7 @@ define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x ha
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
-define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
%ext = fpext <8 x half> %arg to <8 x double>
store <8 x double> %ext, <8 x double> addrspace(1)* %out
ret void
@@ -236,7 +236,7 @@ define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x ha
; GCN-LABEL: {{^}}global_load_store_f16:
; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
; GCN: buffer_store_short [[TMP]]
-define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%val = load half, half addrspace(1)* %in
store half %val, half addrspace(1)* %out
ret void
@@ -245,7 +245,7 @@ define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %
; GCN-LABEL: {{^}}global_load_store_v2f16:
; GCN: buffer_load_dword [[TMP:v[0-9]+]]
; GCN: buffer_store_dword [[TMP]]
-define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
%val = load <2 x half>, <2 x half> addrspace(1)* %in
store <2 x half> %val, <2 x half> addrspace(1)* %out
ret void
@@ -254,7 +254,7 @@ define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> a
; GCN-LABEL: {{^}}global_load_store_v4f16:
; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[TMP]]
-define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 {
%val = load <4 x half>, <4 x half> addrspace(1)* %in
store <4 x half> %val, <4 x half> addrspace(1)* %out
ret void
@@ -264,7 +264,7 @@ define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> ad
; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]]
; GCN: s_endpgm
-define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
%val = load <8 x half>, <8 x half> addrspace(1)* %in
store <8 x half> %val, <8 x half> addrspace(1)* %out
ret void
@@ -274,7 +274,7 @@ define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> a
; GCN: buffer_load_ushort [[LOAD:v[0-9]+]]
; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]]
; GCN: buffer_store_dword [[CVT]]
-define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 {
%val = load half, half addrspace(1)* %in
%cvt = fpext half %val to float
store float %cvt, float addrspace(1)* %out
@@ -283,13 +283,13 @@ define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(
; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; VI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
; GCN: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
-; SI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
-; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
+; SI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
+; SI: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
+; VI: v_cvt_f32_f16_sdwa v[[CVT1:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
; GCN: s_endpgm
-define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
%val = load <2 x half>, <2 x half> addrspace(1)* %in
%cvt = fpext <2 x half> %val to <2 x float>
store <2 x float> %cvt, <2 x float> addrspace(1)* %out
@@ -297,7 +297,7 @@ define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x
}
; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32:
-define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
%val = load <3 x half>, <3 x half> addrspace(1)* %in
%cvt = fpext <3 x half> %val to <3 x float>
store <3 x float> %cvt, <3 x float> addrspace(1)* %out
@@ -305,7 +305,7 @@ define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x
}
; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32:
-define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
%val = load <4 x half>, <4 x half> addrspace(1)* %in
%cvt = fpext <4 x half> %val to <4 x float>
store <4 x float> %cvt, <4 x float> addrspace(1)* %out
@@ -313,7 +313,7 @@ define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x
}
; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32:
-define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
%val = load <8 x half>, <8 x half> addrspace(1)* %in
%cvt = fpext <8 x half> %val to <8 x float>
store <8 x float> %cvt, <8 x float> addrspace(1)* %out
@@ -324,22 +324,26 @@ define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+
+; VI: v_cvt_f32_f16_e32
+; VI: v_cvt_f32_f16_sdwa
+; ...
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
@@ -347,7 +351,7 @@ define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
-define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
%val = load <16 x half>, <16 x half> addrspace(1)* %in
%cvt = fpext <16 x half> %val to <16 x float>
store <16 x float> %cvt, <16 x float> addrspace(1)* %out
@@ -359,7 +363,7 @@ define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <1
; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]]
; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]]
; GCN: buffer_store_dwordx2 [[CVT1]]
-define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 {
%val = load half, half addrspace(1)* %in
%cvt = fpext half %val to double
store double %cvt, double addrspace(1)* %out
@@ -368,14 +372,21 @@ define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace
; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64:
; GCN-DAG: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
-; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
-; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
-; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]]
-; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]]
+
+; SI-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
+; SI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]]
+; SI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]]
+
+; VI-DAG: v_cvt_f32_f16_sdwa v[[CVT0:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD]]
+; VI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT0]]
+; VI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT1]]
+
; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}}
; GCN: s_endpgm
-define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
%val = load <2 x half>, <2 x half> addrspace(1)* %in
%cvt = fpext <2 x half> %val to <2 x double>
store <2 x double> %cvt, <2 x double> addrspace(1)* %out
@@ -392,28 +403,27 @@ define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x
; XSI-NOT: v_cvt_f32_f16
; XVI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
-; XVI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
-; XVI: v_cvt_f32_f16_e32
; XVI: v_cvt_f32_f16_e32
; XVI: v_cvt_f32_f16_e32
+; XVI: v_cvt_f32_f16_sdwa
; XVI-NOT: v_cvt_f32_f16
; GCN: buffer_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]
-; VI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
-; GCN: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]]
-; GCN: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]]
-; SI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
-; GCN: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]
-
-; GCN: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]]
-; GCN: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]]
-; GCN: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]]
+; GCN-DAG: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]]
+; GCN-DAG: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]]
+; SI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]
+; VI-DAG: v_cvt_f32_f16_sdwa [[Y32:v[0-9]+]], v[[IN_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+
+; GCN-DAG: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]]
+; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]]
+; GCN-DAG: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]]
; GCN-NOT: v_cvt_f64_f32_e32
; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[XLO]]:[[YHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 [[Z]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
; GCN: s_endpgm
-define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
%val = load <3 x half>, <3 x half> addrspace(1)* %in
%cvt = fpext <3 x half> %val to <3 x double>
store <3 x double> %cvt, <3 x double> addrspace(1)* %out
@@ -421,7 +431,7 @@ define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x
}
; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64:
-define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
%val = load <4 x half>, <4 x half> addrspace(1)* %in
%cvt = fpext <4 x half> %val to <4 x double>
store <4 x double> %cvt, <4 x double> addrspace(1)* %out
@@ -429,7 +439,7 @@ define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x
}
; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64:
-define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
%val = load <8 x half>, <8 x half> addrspace(1)* %in
%cvt = fpext <8 x half> %val to <8 x double>
store <8 x double> %cvt, <8 x double> addrspace(1)* %out
@@ -437,7 +447,7 @@ define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x
}
; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64:
-define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
%val = load <16 x half>, <16 x half> addrspace(1)* %in
%cvt = fpext <16 x half> %val to <16 x double>
store <16 x double> %cvt, <16 x double> addrspace(1)* %out
@@ -448,7 +458,7 @@ define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]]
; GCN: buffer_store_short [[CVT]]
-define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
%val = load float, float addrspace(1)* %in
%cvt = fptrunc float %val to half
store half %cvt, half addrspace(1)* %out
@@ -458,12 +468,17 @@ define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspa
; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16:
; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]]
-; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
-; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
-; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]]
+
+; SI-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
+; SI-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
+; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]]
+
+; VI-DAG: v_cvt_f16_f32_sdwa [[CVT1:v[0-9]+]], v[[HI]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT1]], [[CVT0]]
+
; GCN-DAG: buffer_store_dword [[PACKED]]
; GCN: s_endpgm
-define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
%val = load <2 x float>, <2 x float> addrspace(1)* %in
%cvt = fptrunc <2 x float> %val to <2 x half>
store <2 x half> %cvt, <2 x half> addrspace(1)* %out
@@ -472,14 +487,14 @@ define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2
; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16:
; GCN: buffer_load_dwordx4
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN-NOT: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; SI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_sdwa
+; GCN-DAG: v_cvt_f16_f32_e32
; GCN: buffer_store_short
; GCN: buffer_store_dword
; GCN: s_endpgm
-define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
%val = load <3 x float>, <3 x float> addrspace(1)* %in
%cvt = fptrunc <3 x float> %val to <3 x half>
store <3 x half> %cvt, <3 x half> addrspace(1)* %out
@@ -488,13 +503,15 @@ define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3
; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16:
; GCN: buffer_load_dwordx4
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; SI-DAG: v_cvt_f16_f32_e32
+; SI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_sdwa
+; VI-DAG: v_cvt_f16_f32_sdwa
+; GCN-DAG: v_cvt_f16_f32_e32
; GCN: buffer_store_dwordx2
; GCN: s_endpgm
-define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
%val = load <4 x float>, <4 x float> addrspace(1)* %in
%cvt = fptrunc <4 x float> %val to <4 x half>
store <4 x half> %cvt, <4 x half> addrspace(1)* %out
@@ -504,17 +521,25 @@ define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4
; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16:
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_sdwa
+; VI-DAG: v_cvt_f16_f32_sdwa
+; VI-DAG: v_cvt_f16_f32_sdwa
+; VI-DAG: v_cvt_f16_f32_sdwa
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
-define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
%val = load <8 x float>, <8 x float> addrspace(1)* %in
%cvt = fptrunc <8 x float> %val to <8 x half>
store <8 x half> %cvt, <8 x half> addrspace(1)* %out
@@ -545,7 +570,7 @@ define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8
; GCN-DAG: buffer_store_dwordx4
; GCN-DAG: buffer_store_dwordx4
; GCN: s_endpgm
-define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
%val = load <16 x float>, <16 x float> addrspace(1)* %in
%cvt = fptrunc <16 x float> %val to <16 x half>
store <16 x half> %cvt, <16 x half> addrspace(1)* %out
@@ -560,7 +585,7 @@ define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out,
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI: v_add_f32
; GCN: s_endpgm
-define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
+define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
%add = fadd half %a, %b
store half %add, half addrspace(1)* %out, align 4
ret void
@@ -570,7 +595,7 @@ define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
-define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
+define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
%add = fadd <2 x half> %a, %b
store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8
ret void
@@ -582,7 +607,7 @@ define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half>
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
-define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
%b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1
%a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16
%b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16
@@ -601,7 +626,7 @@ define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)*
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
-define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
+define amdgpu_kernel void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
%add = fadd <8 x half> %a, %b
store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32
ret void
@@ -610,7 +635,7 @@ define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half>
; GCN-LABEL: {{^}}test_bitcast_from_half:
; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
; GCN: buffer_store_short [[TMP]]
-define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
%val = load half, half addrspace(1)* %in
%val_int = bitcast half %val to i16
store i16 %val_int, i16 addrspace(1)* %out
@@ -620,7 +645,7 @@ define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %o
; GCN-LABEL: {{^}}test_bitcast_to_half:
; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
; GCN: buffer_store_short [[TMP]]
-define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
%val = load i16, i16 addrspace(1)* %in
%val_fp = bitcast i16 %val to half
store half %val_fp, half addrspace(1)* %out
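For reference, a minimal sketch of the pattern behind the SI/VI check splits earlier in half.ll (the kernel name below is hypothetical and not part of the patch). Both targets convert with v_cvt_f32_f16, but VI's SDWA form can read the high 16-bit word of a 32-bit register directly via src0_sel:WORD_1, which is why the v_lshrrev_b32 extraction step survives only in the SI checks:

define amdgpu_kernel void @sketch_extload_v2f16(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in   ; one dword: lo half in WORD_0, hi half in WORD_1
  %cvt = fpext <2 x half> %val to <2 x float>            ; SI: v_lshrrev_b32 + two v_cvt_f32_f16_e32
                                                         ; VI: v_cvt_f32_f16_e32 + v_cvt_f32_f16_sdwa src0_sel:WORD_1
  store <2 x float> %cvt, <2 x float> addrspace(1)* %out
  ret void
}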
diff --git a/test/CodeGen/AMDGPU/hsa-default-device.ll b/test/CodeGen/AMDGPU/hsa-default-device.ll
index 631d6def4442..45efe9b86557 100644
--- a/test/CodeGen/AMDGPU/hsa-default-device.ll
+++ b/test/CodeGen/AMDGPU/hsa-default-device.ll
@@ -4,7 +4,7 @@
; unsupported device.
; CHECK: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
-define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind {
+define amdgpu_kernel void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind {
store float 0.0, float addrspace(1)* %out0
ret void
}
diff --git a/test/CodeGen/AMDGPU/hsa-fp-mode.ll b/test/CodeGen/AMDGPU/hsa-fp-mode.ll
index 51d6aee25f45..b1901cf894b0 100644
--- a/test/CodeGen/AMDGPU/hsa-fp-mode.ll
+++ b/test/CodeGen/AMDGPU/hsa-fp-mode.ll
@@ -4,7 +4,7 @@
; GCN: float_mode = 192
; GCN: enable_dx10_clamp = 1
; GCN: enable_ieee_mode = 1
-define void @test_default_ci(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 {
+define amdgpu_kernel void @test_default_ci(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 {
store float 0.0, float addrspace(1)* %out0
store double 0.0, double addrspace(1)* %out1
ret void
@@ -14,7 +14,7 @@ define void @test_default_ci(float addrspace(1)* %out0, double addrspace(1)* %ou
; GCN: float_mode = 192
; GCN: enable_dx10_clamp = 1
; GCN: enable_ieee_mode = 1
-define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 {
+define amdgpu_kernel void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 {
store float 0.0, float addrspace(1)* %out0
store double 0.0, double addrspace(1)* %out1
ret void
@@ -24,7 +24,7 @@ define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %ou
; GCN: float_mode = 192
; GCN: enable_dx10_clamp = 1
; GCN: enable_ieee_mode = 1
-define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 {
+define amdgpu_kernel void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 {
store float 0.0, float addrspace(1)* %out0
store double 0.0, double addrspace(1)* %out1
ret void
@@ -34,7 +34,7 @@ define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)*
; GCN: float_mode = 48
; GCN: enable_dx10_clamp = 1
; GCN: enable_ieee_mode = 1
-define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 {
+define amdgpu_kernel void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 {
store float 0.0, float addrspace(1)* %out0
store double 0.0, double addrspace(1)* %out1
ret void
@@ -44,7 +44,7 @@ define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)*
; GCN: float_mode = 240
; GCN: enable_dx10_clamp = 1
; GCN: enable_ieee_mode = 1
-define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 {
+define amdgpu_kernel void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 {
store float 0.0, float addrspace(1)* %out0
store double 0.0, double addrspace(1)* %out1
ret void
@@ -54,7 +54,17 @@ define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(
; GCN: float_mode = 0
; GCN: enable_dx10_clamp = 1
; GCN: enable_ieee_mode = 1
-define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 {
+define amdgpu_kernel void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_no_dx10_clamp_vi:
+; GCN: float_mode = 192
+; GCN: enable_dx10_clamp = 0
+; GCN: enable_ieee_mode = 1
+define amdgpu_kernel void @test_no_dx10_clamp_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #6 {
store float 0.0, float addrspace(1)* %out0
store double 0.0, double addrspace(1)* %out1
ret void
@@ -62,7 +72,8 @@ define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %
attributes #0 = { nounwind "target-cpu"="kaveri" }
attributes #1 = { nounwind "target-cpu"="fiji" }
-attributes #2 = { nounwind "target-features"="-fp32-denormals,+fp64-denormals" }
-attributes #3 = { nounwind "target-features"="+fp32-denormals,-fp64-denormals" }
-attributes #4 = { nounwind "target-features"="+fp32-denormals,+fp64-denormals" }
-attributes #5 = { nounwind "target-features"="-fp32-denormals,-fp64-denormals" }
+attributes #2 = { nounwind "target-features"="-fp32-denormals,+fp64-fp16-denormals" }
+attributes #3 = { nounwind "target-features"="+fp32-denormals,-fp64-fp16-denormals" }
+attributes #4 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" }
+attributes #5 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" }
+attributes #6 = { nounwind "target-cpu"="fiji" "target-features"="-dx10-clamp" }
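The float_mode values checked above fall out of the amd_kernel_code_t denormal bits; assuming the usual layout (bits 4-5 gate fp32 denormals, bits 6-7 gate fp64/fp16 denormals, each two-bit field set to 3 when denormals flow both in and out), the expected constants are just sums of those fields. A sketch with hypothetical kernel names and attribute numbers, not part of the patch:

; fp64/fp16 only:  3 << 6              = 192  (the kaveri/fiji default above)
; fp32 only:       3 << 4              =  48
; both:            (3 << 4) | (3 << 6) = 240
; neither:                                 0
define amdgpu_kernel void @sketch_fp32_denorms_only() #10 { ret void }  ; expect float_mode = 48
define amdgpu_kernel void @sketch_all_denorms() #11 { ret void }        ; expect float_mode = 240

attributes #10 = { nounwind "target-features"="+fp32-denormals,-fp64-fp16-denormals" }
attributes #11 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" }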
diff --git a/test/CodeGen/AMDGPU/hsa-func.ll b/test/CodeGen/AMDGPU/hsa-func.ll
index d9662b69b126..b4cdd4030d86 100644
--- a/test/CodeGen/AMDGPU/hsa-func.ll
+++ b/test/CodeGen/AMDGPU/hsa-func.ll
@@ -26,7 +26,7 @@
; ELF: Symbol {
; ELF: Name: simple
-; ELF: Size: 288
+; ELF: Size: 292
; ELF: Type: Function (0x2)
; ELF: }
diff --git a/test/CodeGen/AMDGPU/hsa-globals.ll b/test/CodeGen/AMDGPU/hsa-globals.ll
index 2820b308edb8..2ec57a40f0a2 100644
--- a/test/CodeGen/AMDGPU/hsa-globals.ll
+++ b/test/CodeGen/AMDGPU/hsa-globals.ll
@@ -9,7 +9,7 @@
@internal_readonly = internal unnamed_addr addrspace(2) constant i32 0
@external_readonly = unnamed_addr addrspace(2) constant i32 0
-define void @test() {
+define amdgpu_kernel void @test() {
ret void
}
diff --git a/test/CodeGen/AMDGPU/hsa-group-segment.ll b/test/CodeGen/AMDGPU/hsa-group-segment.ll
index 1999dc38a6b0..600793810e59 100644
--- a/test/CodeGen/AMDGPU/hsa-group-segment.ll
+++ b/test/CodeGen/AMDGPU/hsa-group-segment.ll
@@ -3,7 +3,7 @@
@internal_group = internal addrspace(3) global i32 undef
@external_group = addrspace(3) global i32 undef
-define void @test() {
+define amdgpu_kernel void @test() {
entry:
store i32 0, i32 addrspace(3)* @internal_group
store i32 0, i32 addrspace(3)* @external_group
diff --git a/test/CodeGen/AMDGPU/hsa-note-no-func.ll b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
index a4e599230b74..af63a4f8df76 100644
--- a/test/CodeGen/AMDGPU/hsa-note-no-func.ll
+++ b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
@@ -13,6 +13,8 @@
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx803 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx804 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI804 %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx810 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI810 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX900 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx901 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX901 %s
; HSA: .hsa_code_object_version 2,1
; HSA-CI700: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
@@ -24,3 +26,5 @@
; HSA-VI803: .hsa_code_object_isa 8,0,3,"AMD","AMDGPU"
; HSA-VI804: .hsa_code_object_isa 8,0,4,"AMD","AMDGPU"
; HSA-VI810: .hsa_code_object_isa 8,1,0,"AMD","AMDGPU"
+; HSA-GFX900: .hsa_code_object_isa 9,0,0,"AMD","AMDGPU"
+; HSA-GFX901: .hsa_code_object_isa 9,0,1,"AMD","AMDGPU"
diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll
index 12c15441c0f5..972fbd66ef37 100644
--- a/test/CodeGen/AMDGPU/hsa.ll
+++ b/test/CodeGen/AMDGPU/hsa.ll
@@ -45,6 +45,8 @@
; HSA: .amd_kernel_code_t
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_kernarg_segment_ptr = 1
+; HSA: wavefront_size = 6
+; HSA: call_convention = -1
; HSA: .end_amd_kernel_code_t
; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
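Two hedged notes on the new hsa.ll checks: amd_kernel_code_t stores wavefront_size as a log2, so the expected 6 encodes 2^6 = 64 lanes rather than a 6-lane wave; and call_convention = -1 appears to be that field's "no specific call convention" sentinel rather than an index into a real convention table.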
diff --git a/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll b/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
index e85db65e7429..f6bf0b09486e 100644
--- a/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
+++ b/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
@@ -5,7 +5,7 @@
; SI-LABEL: {{^}}br_implicit_def:
; SI: BB#0:
; SI-NEXT: s_cbranch_scc1
-define void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 {
+define amdgpu_kernel void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 {
bb:
br i1 undef, label %bb1, label %bb2
diff --git a/test/CodeGen/AMDGPU/i1-copy-phi.ll b/test/CodeGen/AMDGPU/i1-copy-phi.ll
index d4912776debd..b160af86a2b6 100644
--- a/test/CodeGen/AMDGPU/i1-copy-phi.ll
+++ b/test/CodeGen/AMDGPU/i1-copy-phi.ll
@@ -10,7 +10,7 @@
; SI: s_and_saveexec_b64
; SI: s_xor_b64
; SI: s_endpgm
-define void @br_i1_phi(i32 %arg) {
+define amdgpu_kernel void @br_i1_phi(i32 %arg) {
bb:
%tidig = call i32 @llvm.r600.read.tidig.x() #0
%cmp = trunc i32 %tidig to i1
diff --git a/test/CodeGen/AMDGPU/i8-to-double-to-float.ll b/test/CodeGen/AMDGPU/i8-to-double-to-float.ll
index c218e1918bb0..d501be5c8bf0 100644
--- a/test/CodeGen/AMDGPU/i8-to-double-to-float.ll
+++ b/test/CodeGen/AMDGPU/i8-to-double-to-float.ll
@@ -2,7 +2,7 @@
;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) {
%1 = load i8, i8 addrspace(1)* %in
%2 = uitofp i8 %1 to double
%3 = fptrunc double %2 to float
diff --git a/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll b/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll
index 60e59a5a5286..12cc440e48d9 100644
--- a/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll
+++ b/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll
@@ -6,7 +6,7 @@
;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;CHECK-NOT: SETNE_INT
-define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%0 = load i32, i32 addrspace(1)* %in
%arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/AMDGPU/icmp.i16.ll b/test/CodeGen/AMDGPU/icmp.i16.ll
index c3dad2d32033..99c2138bbe64 100644
--- a/test/CodeGen/AMDGPU/icmp.i16.ll
+++ b/test/CodeGen/AMDGPU/icmp.i16.ll
@@ -8,7 +8,7 @@
; GCN-LABEL: {{^}}i16_eq:
; VI: v_cmp_eq_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_eq_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_eq(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_eq(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -26,7 +26,7 @@ entry:
; GCN-LABEL: {{^}}i16_ne:
; VI: v_cmp_ne_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_ne_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ne(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_ne(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -44,7 +44,7 @@ entry:
; GCN-LABEL: {{^}}i16_ugt:
; VI: v_cmp_gt_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_gt_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ugt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_ugt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -62,7 +62,7 @@ entry:
; GCN-LABEL: {{^}}i16_uge:
; VI: v_cmp_ge_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_ge_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_uge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_uge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -80,7 +80,7 @@ entry:
; GCN-LABEL: {{^}}i16_ult:
; VI: v_cmp_lt_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_lt_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ult(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_ult(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -98,7 +98,7 @@ entry:
; GCN-LABEL: {{^}}i16_ule:
; VI: v_cmp_le_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_le_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ule(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_ule(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -117,7 +117,7 @@ entry:
; GCN-LABEL: {{^}}i16_sgt:
; VI: v_cmp_gt_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_gt_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_sgt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_sgt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -135,7 +135,7 @@ entry:
; GCN-LABEL: {{^}}i16_sge:
; VI: v_cmp_ge_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_ge_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_sge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_sge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -153,7 +153,7 @@ entry:
; GCN-LABEL: {{^}}i16_slt:
; VI: v_cmp_lt_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_lt_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_slt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_slt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -171,7 +171,7 @@ entry:
; GCN-LABEL: {{^}}i16_sle:
; VI: v_cmp_le_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_le_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_sle(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_sle(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -190,7 +190,7 @@ entry:
; GCN-LABEL: {{^}}i16_eq_v_s:
; VI: v_cmp_eq_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_eq_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_eq_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_eq_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -206,7 +206,7 @@ entry:
; GCN-LABEL: {{^}}i16_ne_v_s:
; VI: v_cmp_ne_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_ne_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ne_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_ne_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -222,7 +222,7 @@ entry:
; GCN-LABEL: {{^}}i16_ugt_v_s:
; VI: v_cmp_lt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_lt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ugt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_ugt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -238,7 +238,7 @@ entry:
; GCN-LABEL: {{^}}i16_uge_v_s:
; VI: v_cmp_le_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_le_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_uge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_uge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -254,7 +254,7 @@ entry:
; GCN-LABEL: {{^}}i16_ult_v_s:
; VI: v_cmp_gt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ult_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_ult_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -270,7 +270,7 @@ entry:
; GCN-LABEL: {{^}}i16_ule_v_s:
; VI: v_cmp_ge_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_ge_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ule_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_ule_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -286,7 +286,7 @@ entry:
; GCN-LABEL: {{^}}i16_sgt_v_s:
; VI: v_cmp_lt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_lt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_sgt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_sgt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -302,7 +302,7 @@ entry:
; GCN-LABEL: {{^}}i16_sge_v_s:
; VI: v_cmp_le_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_sge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_sge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -318,7 +318,7 @@ entry:
; GCN-LABEL: {{^}}i16_slt_v_s:
; VI: v_cmp_gt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_gt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_slt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_slt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -334,7 +334,7 @@ entry:
; GCN-LABEL: {{^}}i16_sle_v_s:
; VI: v_cmp_ge_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_ge_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_sle_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_sle_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
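One detail behind the *_v_s checks above: a VOPC compare can take an SGPR only as its first source operand, so when the divergent value is in a VGPR and the uniform value in an SGPR, the backend commutes the compare and flips the predicate (ugt becomes lt, uge becomes le, and so on). A minimal sketch under that assumption, with a hypothetical kernel name:

declare i32 @llvm.amdgcn.workitem.id.x()

define amdgpu_kernel void @sketch_i16_ugt_commuted(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid
  %a = load i16, i16 addrspace(1)* %gep   ; divergent: %a lives in a VGPR
  %cmp = icmp ugt i16 %a, %b              ; %b is a uniform kernel argument (SGPR)
  ; VI emits: v_cmp_lt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
  ; i.e. "b < a", the commuted form of "a > b", keeping the SGPR in src0.
  %ext = zext i1 %cmp to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
}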
diff --git a/test/CodeGen/AMDGPU/icmp64.ll b/test/CodeGen/AMDGPU/icmp64.ll
index 33ad0c9199b9..3af74277df12 100644
--- a/test/CodeGen/AMDGPU/icmp64.ll
+++ b/test/CodeGen/AMDGPU/icmp64.ll
@@ -3,7 +3,7 @@
; SI-LABEL: {{^}}test_i64_eq:
; SI: v_cmp_eq_u64
-define void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%cmp = icmp eq i64 %a, %b
%result = sext i1 %cmp to i32
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -12,7 +12,7 @@ define void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; SI-LABEL: {{^}}test_i64_ne:
; SI: v_cmp_ne_u64
-define void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%cmp = icmp ne i64 %a, %b
%result = sext i1 %cmp to i32
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -21,7 +21,7 @@ define void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; SI-LABEL: {{^}}test_i64_slt:
; SI: v_cmp_lt_i64
-define void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%cmp = icmp slt i64 %a, %b
%result = sext i1 %cmp to i32
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -30,7 +30,7 @@ define void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; SI-LABEL: {{^}}test_i64_ult:
; SI: v_cmp_lt_u64
-define void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%cmp = icmp ult i64 %a, %b
%result = sext i1 %cmp to i32
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -39,7 +39,7 @@ define void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; SI-LABEL: {{^}}test_i64_sle:
; SI: v_cmp_le_i64
-define void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%cmp = icmp sle i64 %a, %b
%result = sext i1 %cmp to i32
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -48,7 +48,7 @@ define void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; SI-LABEL: {{^}}test_i64_ule:
; SI: v_cmp_le_u64
-define void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%cmp = icmp ule i64 %a, %b
%result = sext i1 %cmp to i32
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -57,7 +57,7 @@ define void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; SI-LABEL: {{^}}test_i64_sgt:
; SI: v_cmp_gt_i64
-define void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%cmp = icmp sgt i64 %a, %b
%result = sext i1 %cmp to i32
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -66,7 +66,7 @@ define void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; SI-LABEL: {{^}}test_i64_ugt:
; SI: v_cmp_gt_u64
-define void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%cmp = icmp ugt i64 %a, %b
%result = sext i1 %cmp to i32
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -75,7 +75,7 @@ define void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; SI-LABEL: {{^}}test_i64_sge:
; SI: v_cmp_ge_i64
-define void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%cmp = icmp sge i64 %a, %b
%result = sext i1 %cmp to i32
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -84,7 +84,7 @@ define void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; SI-LABEL: {{^}}test_i64_uge:
; SI: v_cmp_ge_u64
-define void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%cmp = icmp uge i64 %a, %b
%result = sext i1 %cmp to i32
store i32 %result, i32 addrspace(1)* %out, align 4
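A side note on the pairs above: eq and ne are signedness-agnostic, so the test pins the unsigned forms v_cmp_eq_u64 / v_cmp_ne_u64 as canonical; only the ordered predicates split into matching _i64 and _u64 variants.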
diff --git a/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll b/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll
new file mode 100644
index 000000000000..6e411ce5e017
--- /dev/null
+++ b/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll
@@ -0,0 +1,45 @@
+; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck -check-prefix=ERR %s
+; RUN: not llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+
+; ERR: error: <unknown>:0:0: in function illegal_vgpr_to_sgpr_copy_i32 void (): illegal SGPR to VGPR copy
+; GCN: ; illegal copy v1 to s9
+
+define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_i32() #0 {
+ %vgpr = call i32 asm sideeffect "; def $0", "=${VGPR1}"()
+ call void asm sideeffect "; use $0", "${SGPR9}"(i32 %vgpr)
+ ret void
+}
+
+; ERR: error: <unknown>:0:0: in function illegal_vgpr_to_sgpr_copy_v2i32 void (): illegal SGPR to VGPR copy
+; GCN: ; illegal copy v[0:1] to s[10:11]
+define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v2i32() #0 {
+ %vgpr = call <2 x i32> asm sideeffect "; def $0", "=${VGPR0_VGPR1}"()
+ call void asm sideeffect "; use $0", "${SGPR10_SGPR11}"(<2 x i32> %vgpr)
+ ret void
+}
+
+; ERR: error: <unknown>:0:0: in function illegal_vgpr_to_sgpr_copy_v4i32 void (): illegal SGPR to VGPR copy
+; GCN: ; illegal copy v[0:3] to s[8:11]
+define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v4i32() #0 {
+ %vgpr = call <4 x i32> asm sideeffect "; def $0", "=${VGPR0_VGPR1_VGPR2_VGPR3}"()
+ call void asm sideeffect "; use $0", "${SGPR8_SGPR9_SGPR10_SGPR11}"(<4 x i32> %vgpr)
+ ret void
+}
+
+; ERR: error: <unknown>:0:0: in function illegal_vgpr_to_sgpr_copy_v8i32 void (): illegal SGPR to VGPR copy
+; GCN: ; illegal copy v[0:7] to s[8:15]
+define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v8i32() #0 {
+ %vgpr = call <8 x i32> asm sideeffect "; def $0", "=${VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7}"()
+ call void asm sideeffect "; use $0", "${SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}"(<8 x i32> %vgpr)
+ ret void
+}
+
+; ERR: error: <unknown>:0:0: in function illegal_vgpr_to_sgpr_copy_v16i32 void (): illegal SGPR to VGPR copy
+; GCN: ; illegal copy v[0:15] to s[16:31]
+define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v16i32() #0 {
+ %vgpr = call <16 x i32> asm sideeffect "; def $0", "=${VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7_VGPR8_VGPR9_VGPR10_VGPR11_VGPR12_VGPR13_VGPR14_VGPR15}"()
+ call void asm sideeffect "; use $0", "${SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23_SGPR24_SGPR25_SGPR26_SGPR27_SGPR28_SGPR29_SGPR30_SGPR31}"(<16 x i32> %vgpr)
+ ret void
+}
+
+attributes #0 = { nounwind }
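The diagnostics above all involve copying a VGPR result into SGPR constraints, which no GCN instruction can do directly (v_readfirstlane_b32 exists, but is only meaningful for provably uniform values, so the copy is rejected rather than silently scalarized). The reverse direction is a plain v_mov_b32 and compiles cleanly; a sketch with a hypothetical name, reusing the test's own constraint style:

define amdgpu_kernel void @legal_sgpr_to_vgpr_copy_i32() #0 {
  %sgpr = call i32 asm sideeffect "; def $0", "=${SGPR9}"()
  call void asm sideeffect "; use $0", "${VGPR1}"(i32 %sgpr)
  ret void
}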
diff --git a/test/CodeGen/AMDGPU/image-attributes.ll b/test/CodeGen/AMDGPU/image-attributes.ll
index 5906b2f15709..53d61e66c6ba 100644
--- a/test/CodeGen/AMDGPU/image-attributes.ll
+++ b/test/CodeGen/AMDGPU/image-attributes.ll
@@ -7,7 +7,7 @@
; FUNC-LABEL: {{^}}width_2d:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV * [[VAL]], KC0[2].Z
-define void @width_2d (%opencl.image2d_t addrspace(1)* %in,
+define amdgpu_kernel void @width_2d (%opencl.image2d_t addrspace(1)* %in,
i32 addrspace(1)* %out) {
entry:
%0 = call [3 x i32] @llvm.OpenCL.image.get.size.2d(
@@ -20,7 +20,7 @@ entry:
; FUNC-LABEL: {{^}}width_3d:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV * [[VAL]], KC0[2].Z
-define void @width_3d (%opencl.image3d_t addrspace(1)* %in,
+define amdgpu_kernel void @width_3d (%opencl.image3d_t addrspace(1)* %in,
i32 addrspace(1)* %out) {
entry:
%0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d(
@@ -37,7 +37,7 @@ entry:
; FUNC-LABEL: {{^}}height_2d:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV * [[VAL]], KC0[2].W
-define void @height_2d (%opencl.image2d_t addrspace(1)* %in,
+define amdgpu_kernel void @height_2d (%opencl.image2d_t addrspace(1)* %in,
i32 addrspace(1)* %out) {
entry:
%0 = call [3 x i32] @llvm.OpenCL.image.get.size.2d(
@@ -50,7 +50,7 @@ entry:
; FUNC-LABEL: {{^}}height_3d:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV * [[VAL]], KC0[2].W
-define void @height_3d (%opencl.image3d_t addrspace(1)* %in,
+define amdgpu_kernel void @height_3d (%opencl.image3d_t addrspace(1)* %in,
i32 addrspace(1)* %out) {
entry:
%0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d(
@@ -67,7 +67,7 @@ entry:
; FUNC-LABEL: {{^}}depth_3d:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV * [[VAL]], KC0[3].X
-define void @depth_3d (%opencl.image3d_t addrspace(1)* %in,
+define amdgpu_kernel void @depth_3d (%opencl.image3d_t addrspace(1)* %in,
i32 addrspace(1)* %out) {
entry:
%0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d(
@@ -84,7 +84,7 @@ entry:
; FUNC-LABEL: {{^}}data_type_2d:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV * [[VAL]], KC0[3].Y
-define void @data_type_2d (%opencl.image2d_t addrspace(1)* %in,
+define amdgpu_kernel void @data_type_2d (%opencl.image2d_t addrspace(1)* %in,
i32 addrspace(1)* %out) {
entry:
%0 = call [2 x i32] @llvm.OpenCL.image.get.format.2d(
@@ -97,7 +97,7 @@ entry:
; FUNC-LABEL: {{^}}data_type_3d:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV * [[VAL]], KC0[3].Y
-define void @data_type_3d (%opencl.image3d_t addrspace(1)* %in,
+define amdgpu_kernel void @data_type_3d (%opencl.image3d_t addrspace(1)* %in,
i32 addrspace(1)* %out) {
entry:
%0 = call [2 x i32] @llvm.OpenCL.image.get.format.3d(
@@ -114,7 +114,7 @@ entry:
; FUNC-LABEL: {{^}}channel_order_2d:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV * [[VAL]], KC0[3].Z
-define void @channel_order_2d (%opencl.image2d_t addrspace(1)* %in,
+define amdgpu_kernel void @channel_order_2d (%opencl.image2d_t addrspace(1)* %in,
i32 addrspace(1)* %out) {
entry:
%0 = call [2 x i32] @llvm.OpenCL.image.get.format.2d(
@@ -127,7 +127,7 @@ entry:
; FUNC-LABEL: {{^}}channel_order_3d:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV * [[VAL]], KC0[3].Z
-define void @channel_order_3d (%opencl.image3d_t addrspace(1)* %in,
+define amdgpu_kernel void @channel_order_3d (%opencl.image3d_t addrspace(1)* %in,
i32 addrspace(1)* %out) {
entry:
%0 = call [2 x i32] @llvm.OpenCL.image.get.format.3d(
@@ -146,7 +146,7 @@ entry:
; FUNC-LABEL: {{^}}image_arg_2nd:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV * [[VAL]], KC0[4].Z
-define void @image_arg_2nd (%opencl.image3d_t addrspace(1)* %in1,
+define amdgpu_kernel void @image_arg_2nd (%opencl.image3d_t addrspace(1)* %in1,
i32 %x,
%opencl.image2d_t addrspace(1)* %in2,
i32 addrspace(1)* %out) {
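A hedged note on the EG checks in this file: the image attribute intrinsics are lowered to implicit kernel arguments read from the kernel-argument constant buffer KC0, and the progression KC0[2].Z, KC0[2].W, KC0[3].X, KC0[3].Y, KC0[3].Z for width, height, depth, data type, and channel order suggests consecutive dwords per image; image_arg_2nd reading KC0[4].Z is consistent with a second image's block starting further in.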
diff --git a/test/CodeGen/AMDGPU/image-resource-id.ll b/test/CodeGen/AMDGPU/image-resource-id.ll
index d4cf34944240..dac7c7ddaeac 100644
--- a/test/CodeGen/AMDGPU/image-resource-id.ll
+++ b/test/CodeGen/AMDGPU/image-resource-id.ll
@@ -7,7 +7,7 @@
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 0(
-define void @test_2d_rd_1_0(%opencl.image2d_t addrspace(1)* %in, ; read_only
+define amdgpu_kernel void @test_2d_rd_1_0(%opencl.image2d_t addrspace(1)* %in, ; read_only
i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.OpenCL.image.get.resource.id.2d(
@@ -21,7 +21,7 @@ entry:
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 0(
-define void @test_3d_rd_1_0(%opencl.image3d_t addrspace(1)* %in, ; read_only
+define amdgpu_kernel void @test_3d_rd_1_0(%opencl.image3d_t addrspace(1)* %in, ; read_only
i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.OpenCL.image.get.resource.id.3d(
@@ -37,7 +37,7 @@ entry:
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 0(
-define void @test_2d_wr_1_0(%opencl.image2d_t addrspace(1)* %in, ; write_only
+define amdgpu_kernel void @test_2d_wr_1_0(%opencl.image2d_t addrspace(1)* %in, ; write_only
i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.OpenCL.image.get.resource.id.2d(
@@ -51,7 +51,7 @@ entry:
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 0(
-define void @test_3d_wr_1_0(%opencl.image3d_t addrspace(1)* %in, ; write_only
+define amdgpu_kernel void @test_3d_wr_1_0(%opencl.image3d_t addrspace(1)* %in, ; write_only
i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.OpenCL.image.get.resource.id.3d(
@@ -67,7 +67,7 @@ entry:
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 0(
-define void @test_2d_rd_2_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only
+define amdgpu_kernel void @test_2d_rd_2_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only
%opencl.image2d_t addrspace(1)* %in2, ; read_only
i32 addrspace(1)* %out) {
entry:
@@ -82,7 +82,7 @@ entry:
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 1(
-define void @test_2d_rd_2_1(%opencl.image2d_t addrspace(1)* %in1, ; read_only
+define amdgpu_kernel void @test_2d_rd_2_1(%opencl.image2d_t addrspace(1)* %in1, ; read_only
%opencl.image2d_t addrspace(1)* %in2, ; read_only
i32 addrspace(1)* %out) {
entry:
@@ -97,7 +97,7 @@ entry:
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 0(
-define void @test_3d_rd_2_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only
+define amdgpu_kernel void @test_3d_rd_2_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only
%opencl.image3d_t addrspace(1)* %in2, ; read_only
i32 addrspace(1)* %out) {
entry:
@@ -112,7 +112,7 @@ entry:
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 1(
-define void @test_3d_rd_2_1(%opencl.image3d_t addrspace(1)* %in1, ; read_only
+define amdgpu_kernel void @test_3d_rd_2_1(%opencl.image3d_t addrspace(1)* %in1, ; read_only
%opencl.image3d_t addrspace(1)* %in2, ; read_only
i32 addrspace(1)* %out) {
entry:
@@ -129,7 +129,7 @@ entry:
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 0(
-define void @test_2d_wr_2_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_2d_wr_2_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only
%opencl.image2d_t addrspace(1)* %in2, ; write_only
i32 addrspace(1)* %out) {
entry:
@@ -144,7 +144,7 @@ entry:
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 1(
-define void @test_2d_wr_2_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_2d_wr_2_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only
%opencl.image2d_t addrspace(1)* %in2, ; write_only
i32 addrspace(1)* %out) {
entry:
@@ -159,7 +159,7 @@ entry:
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 0(
-define void @test_3d_wr_2_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_3d_wr_2_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only
%opencl.image3d_t addrspace(1)* %in2, ; write_only
i32 addrspace(1)* %out) {
entry:
@@ -174,7 +174,7 @@ entry:
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 1(
-define void @test_3d_wr_2_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_3d_wr_2_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only
%opencl.image3d_t addrspace(1)* %in2, ; write_only
i32 addrspace(1)* %out) {
entry:
@@ -191,7 +191,7 @@ entry:
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 2(
-define void @test_2d_rd_3_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only
+define amdgpu_kernel void @test_2d_rd_3_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only
%opencl.image3d_t addrspace(1)* %in2, ; read_only
%opencl.image2d_t addrspace(1)* %in3, ; read_only
i32 addrspace(1)* %out) {
@@ -208,7 +208,7 @@ entry:
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 2(
-define void @test_3d_rd_3_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only
+define amdgpu_kernel void @test_3d_rd_3_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only
%opencl.image2d_t addrspace(1)* %in2, ; read_only
%opencl.image3d_t addrspace(1)* %in3, ; read_only
i32 addrspace(1)* %out) {
@@ -226,7 +226,7 @@ entry:
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 2(
-define void @test_2d_wr_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_2d_wr_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only
%opencl.image3d_t addrspace(1)* %in2, ; write_only
%opencl.image2d_t addrspace(1)* %in3, ; write_only
i32 addrspace(1)* %out) {
@@ -243,7 +243,7 @@ entry:
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 2(
-define void @test_3d_wr_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_3d_wr_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only
%opencl.image2d_t addrspace(1)* %in2, ; write_only
%opencl.image3d_t addrspace(1)* %in3, ; write_only
i32 addrspace(1)* %out) {
@@ -261,7 +261,7 @@ entry:
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 1(
-define void @test_2d_mix_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_2d_mix_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only
%opencl.image3d_t addrspace(1)* %in2, ; read_only
%opencl.image2d_t addrspace(1)* %in3, ; read_only
i32 addrspace(1)* %out) {
@@ -277,7 +277,7 @@ entry:
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 1(
-define void @test_3d_mix_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_3d_mix_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only
%opencl.image2d_t addrspace(1)* %in2, ; read_only
%opencl.image3d_t addrspace(1)* %in3, ; read_only
i32 addrspace(1)* %out) {
@@ -293,7 +293,7 @@ entry:
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 1(
-define void @test_2d_mix_3_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_2d_mix_3_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only
%opencl.image3d_t addrspace(1)* %in2, ; read_only
%opencl.image2d_t addrspace(1)* %in3, ; write_only
i32 addrspace(1)* %out) {
@@ -309,7 +309,7 @@ entry:
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 1(
-define void @test_3d_mix_3_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_3d_mix_3_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only
%opencl.image2d_t addrspace(1)* %in2, ; read_only
%opencl.image3d_t addrspace(1)* %in3, ; write_only
i32 addrspace(1)* %out) {
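A hedged reading of the expectations in this file: resource IDs appear to be assigned in declaration order but counted separately per access kind, so the second read_only image gets ID 1 even when write_only images precede it, and the mix tests expect the index within the queried image's own class rather than its overall argument position.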
diff --git a/test/CodeGen/AMDGPU/imm.ll b/test/CodeGen/AMDGPU/imm.ll
index ef6008aa5fde..c2668a077b09 100644
--- a/test/CodeGen/AMDGPU/imm.ll
+++ b/test/CodeGen/AMDGPU/imm.ll
@@ -5,7 +5,7 @@
; GCN-LABEL: {{^}}i64_imm_inline_lo:
; GCN: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], 5
; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VGPR]]:
-define void @i64_imm_inline_lo(i64 addrspace(1) *%out) {
+define amdgpu_kernel void @i64_imm_inline_lo(i64 addrspace(1) *%out) {
entry:
store i64 1311768464867721221, i64 addrspace(1) *%out ; 0x1234567800000005
ret void
@@ -15,7 +15,7 @@ entry:
; GCN-LABEL: {{^}}i64_imm_inline_hi:
; GCN: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], 5
; GCN: buffer_store_dwordx2 v{{\[[0-9]+:}}[[HI_VGPR]]
-define void @i64_imm_inline_hi(i64 addrspace(1) *%out) {
+define amdgpu_kernel void @i64_imm_inline_hi(i64 addrspace(1) *%out) {
entry:
store i64 21780256376, i64 addrspace(1) *%out ; 0x0000000512345678
ret void
@@ -25,7 +25,7 @@ entry:
; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
; GCN-DAG: v_bfrev_b32_e32 v[[HI_VREG:[0-9]+]], 1{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
+define amdgpu_kernel void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
store i64 -9223372036854775808, i64 addrspace(1) *%out
ret void
}
@@ -33,7 +33,7 @@ define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_i32:
; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) {
store i32 -2147483648, i32 addrspace(1)* %out
ret void
}
@@ -41,7 +41,7 @@ define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_0.0_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.0_f32(float addrspace(1)* %out) {
store float 0.0, float addrspace(1)* %out
ret void
}
@@ -49,7 +49,7 @@ define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_imm_neg_0.0_f32:
; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @store_imm_neg_0.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_imm_neg_0.0_f32(float addrspace(1)* %out) {
store float -0.0, float addrspace(1)* %out
ret void
}
@@ -57,7 +57,7 @@ define void @store_imm_neg_0.0_f32(float addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_0.5_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0.5{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_0.5_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.5_f32(float addrspace(1)* %out) {
store float 0.5, float addrspace(1)* %out
ret void
}
@@ -65,7 +65,7 @@ define void @store_inline_imm_0.5_f32(float addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_m_0.5_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -0.5{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) {
store float -0.5, float addrspace(1)* %out
ret void
}
@@ -73,7 +73,7 @@ define void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_1.0_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_1.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_1.0_f32(float addrspace(1)* %out) {
store float 1.0, float addrspace(1)* %out
ret void
}
@@ -81,7 +81,7 @@ define void @store_inline_imm_1.0_f32(float addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_m_1.0_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) {
store float -1.0, float addrspace(1)* %out
ret void
}
@@ -89,7 +89,7 @@ define void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_2.0_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_2.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_2.0_f32(float addrspace(1)* %out) {
store float 2.0, float addrspace(1)* %out
ret void
}
@@ -97,7 +97,7 @@ define void @store_inline_imm_2.0_f32(float addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_m_2.0_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -2.0{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) {
store float -2.0, float addrspace(1)* %out
ret void
}
@@ -105,7 +105,7 @@ define void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_4.0_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 4.0{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_4.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_4.0_f32(float addrspace(1)* %out) {
store float 4.0, float addrspace(1)* %out
ret void
}
@@ -113,7 +113,7 @@ define void @store_inline_imm_4.0_f32(float addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_m_4.0_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -4.0{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) {
store float -4.0, float addrspace(1)* %out
ret void
}
@@ -123,7 +123,7 @@ define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) {
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e22f983{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0.15915494{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_inv_2pi_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(float addrspace(1)* %out) {
store float 0x3FC45F3060000000, float addrspace(1)* %out
ret void
}
@@ -131,7 +131,7 @@ define void @store_inline_imm_inv_2pi_f32(float addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_f32:
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbe22f983{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_m_inv_2pi_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(float addrspace(1)* %out) {
store float 0xBFC45F3060000000, float addrspace(1)* %out
ret void
}
@@ -139,7 +139,7 @@ define void @store_inline_imm_m_inv_2pi_f32(float addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_literal_imm_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x45800000
; GCN: buffer_store_dword [[REG]]
-define void @store_literal_imm_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_f32(float addrspace(1)* %out) {
store float 4096.0, float addrspace(1)* %out
ret void
}
@@ -148,7 +148,7 @@ define void @store_literal_imm_f32(float addrspace(1)* %out) {
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 0{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) {
%y = fadd float %x, 0.0
store float %y, float addrspace(1)* %out
ret void
@@ -158,7 +158,7 @@ define void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) {
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 0.5{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) {
%y = fadd float %x, 0.5
store float %y, float addrspace(1)* %out
ret void
@@ -168,7 +168,7 @@ define void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) {
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -0.5{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) {
%y = fadd float %x, -0.5
store float %y, float addrspace(1)* %out
ret void
@@ -178,7 +178,7 @@ define void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) {
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 1.0{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) {
%y = fadd float %x, 1.0
store float %y, float addrspace(1)* %out
ret void
@@ -188,7 +188,7 @@ define void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) {
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -1.0{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) {
%y = fadd float %x, -1.0
store float %y, float addrspace(1)* %out
ret void
@@ -198,7 +198,7 @@ define void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) {
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 2.0{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) {
%y = fadd float %x, 2.0
store float %y, float addrspace(1)* %out
ret void
@@ -208,7 +208,7 @@ define void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) {
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -2.0{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) {
%y = fadd float %x, -2.0
store float %y, float addrspace(1)* %out
ret void
@@ -218,7 +218,7 @@ define void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) {
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 4.0{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) {
%y = fadd float %x, 4.0
store float %y, float addrspace(1)* %out
ret void
@@ -228,7 +228,7 @@ define void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) {
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -4.0{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) {
%y = fadd float %x, -4.0
store float %y, float addrspace(1)* %out
ret void
@@ -238,7 +238,7 @@ define void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) {
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_add_f32_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]
; GCN: buffer_store_dword [[REG]]
-define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%x = load float, float addrspace(1)* %in
%y = fadd float %x, 0.5
store float %y, float addrspace(1)* %out
@@ -249,7 +249,7 @@ define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addr
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_add_f32_e32 [[REG:v[0-9]+]], 0x44800000, [[VAL]]
; GCN: buffer_store_dword [[REG]]
-define void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%x = load float, float addrspace(1)* %in
%y = fadd float %x, 1024.0
store float %y, float addrspace(1)* %out
@@ -260,7 +260,7 @@ define void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 1{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) {
%y = fadd float %x, 0x36a0000000000000
store float %y, float addrspace(1)* %out
ret void
@@ -270,7 +270,7 @@ define void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) {
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 2{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) {
%y = fadd float %x, 0x36b0000000000000
store float %y, float addrspace(1)* %out
ret void
@@ -280,7 +280,7 @@ define void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) {
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 16
; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) {
%y = fadd float %x, 0x36e0000000000000
store float %y, float addrspace(1)* %out
ret void
@@ -290,7 +290,7 @@ define void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) {
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -1{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) {
%y = fadd float %x, 0xffffffffe0000000
store float %y, float addrspace(1)* %out
ret void
@@ -300,7 +300,7 @@ define void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) {
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -2{{$}}
; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) {
%y = fadd float %x, 0xffffffffc0000000
store float %y, float addrspace(1)* %out
ret void
@@ -310,7 +310,7 @@ define void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) {
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -16
; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) {
%y = fadd float %x, 0xfffffffe00000000
store float %y, float addrspace(1)* %out
ret void
@@ -320,7 +320,7 @@ define void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) {
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 63
; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) {
%y = fadd float %x, 0x36ff800000000000
store float %y, float addrspace(1)* %out
ret void
@@ -330,7 +330,7 @@ define void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) {
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 64
; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) {
%y = fadd float %x, 0x3700000000000000
store float %y, float addrspace(1)* %out
ret void
@@ -342,7 +342,7 @@ define void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) {
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}}
; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0.0
store double %y, double addrspace(1)* %out
ret void
@@ -353,7 +353,7 @@ define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) {
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.5
; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0.5
store double %y, double addrspace(1)* %out
ret void
@@ -364,7 +364,7 @@ define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) {
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -0.5
; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, -0.5
store double %y, double addrspace(1)* %out
ret void
@@ -375,7 +375,7 @@ define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) {
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1.0
; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 1.0
store double %y, double addrspace(1)* %out
ret void
@@ -386,7 +386,7 @@ define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) {
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1.0
; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, -1.0
store double %y, double addrspace(1)* %out
ret void
@@ -397,7 +397,7 @@ define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) {
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2.0
; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 2.0
store double %y, double addrspace(1)* %out
ret void
@@ -408,7 +408,7 @@ define void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) {
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2.0
; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, -2.0
store double %y, double addrspace(1)* %out
ret void
@@ -419,7 +419,7 @@ define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) {
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 4.0
; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 4.0
store double %y, double addrspace(1)* %out
ret void
@@ -430,7 +430,7 @@ define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) {
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -4.0
; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, -4.0
store double %y, double addrspace(1)* %out
ret void
@@ -445,7 +445,7 @@ define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) {
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; VI: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.15915494{{$}}
; VI: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0x3fc45f306dc9c882
store double %y, double addrspace(1)* %out
ret void
@@ -455,7 +455,7 @@ define void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) {
; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882
; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfc45f30
; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0xbfc45f306dc9c882
store double %y, double addrspace(1)* %out
ret void
@@ -466,7 +466,7 @@ define void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) {
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1{{$}}
; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0x0000000000000001
store double %y, double addrspace(1)* %out
ret void
@@ -477,7 +477,7 @@ define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) {
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2{{$}}
; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0x0000000000000002
store double %y, double addrspace(1)* %out
ret void
@@ -488,7 +488,7 @@ define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) {
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 16
; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0x0000000000000010
store double %y, double addrspace(1)* %out
ret void
@@ -499,7 +499,7 @@ define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) {
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1
; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0xffffffffffffffff
store double %y, double addrspace(1)* %out
ret void
@@ -510,7 +510,7 @@ define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) {
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2
; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0xfffffffffffffffe
store double %y, double addrspace(1)* %out
ret void
@@ -521,7 +521,7 @@ define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) {
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -16
; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0xfffffffffffffff0
store double %y, double addrspace(1)* %out
ret void
@@ -532,7 +532,7 @@ define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) {
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 63
; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0x000000000000003F
store double %y, double addrspace(1)* %out
ret void
@@ -543,7 +543,7 @@ define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) {
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 64
; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0x0000000000000040
store double %y, double addrspace(1)* %out
ret void
@@ -554,7 +554,7 @@ define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
; GCN: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0
; GCN: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], v[[LO_VREG]]{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
store double 0.0, double addrspace(1)* %out
ret void
}
@@ -564,7 +564,7 @@ define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
; GCN-DAG: v_bfrev_b32_e32 v[[HI_VREG:[0-9]+]], 1{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) {
store double -0.0, double addrspace(1)* %out
ret void
}
@@ -573,7 +573,7 @@ define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fe00000
; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_0.5_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.5_f64(double addrspace(1)* %out) {
store double 0.5, double addrspace(1)* %out
ret void
}
@@ -582,7 +582,7 @@ define void @store_inline_imm_0.5_f64(double addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfe00000
; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) {
store double -0.5, double addrspace(1)* %out
ret void
}
@@ -591,7 +591,7 @@ define void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3ff00000
; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_1.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_1.0_f64(double addrspace(1)* %out) {
store double 1.0, double addrspace(1)* %out
ret void
}
@@ -600,7 +600,7 @@ define void @store_inline_imm_1.0_f64(double addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbff00000
; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) {
store double -1.0, double addrspace(1)* %out
ret void
}
@@ -609,7 +609,7 @@ define void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 2.0
; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_2.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_2.0_f64(double addrspace(1)* %out) {
store double 2.0, double addrspace(1)* %out
ret void
}
@@ -618,7 +618,7 @@ define void @store_inline_imm_2.0_f64(double addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], -2.0
; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) {
store double -2.0, double addrspace(1)* %out
ret void
}
@@ -627,7 +627,7 @@ define void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40100000
; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_4.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_4.0_f64(double addrspace(1)* %out) {
store double 4.0, double addrspace(1)* %out
ret void
}
@@ -636,7 +636,7 @@ define void @store_inline_imm_4.0_f64(double addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xc0100000
; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) {
store double -4.0, double addrspace(1)* %out
ret void
}
@@ -645,7 +645,7 @@ define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882
; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fc45f30
; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inv_2pi_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inv_2pi_f64(double addrspace(1)* %out) {
store double 0x3fc45f306dc9c882, double addrspace(1)* %out
ret void
}
@@ -654,7 +654,7 @@ define void @store_inv_2pi_f64(double addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882
; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfc45f30
; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_m_inv_2pi_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(double addrspace(1)* %out) {
store double 0xbfc45f306dc9c882, double addrspace(1)* %out
ret void
}
@@ -663,7 +663,22 @@ define void @store_inline_imm_m_inv_2pi_f64(double addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40b00000
; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_literal_imm_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_f64(double addrspace(1)* %out) {
store double 4096.0, double addrspace(1)* %out
ret void
}
+
+; GCN-LABEL: {{^}}literal_folding:
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f4353f8, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0xbf4353f8, v{{[0-9]+}}
+define amdgpu_vs void @literal_folding(float %arg) {
+main_body:
+ %tmp = fmul float %arg, 0x3FE86A7F00000000
+ %tmp1 = fmul float %arg, 0xBFE86A7F00000000
+ call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %tmp, float %tmp, float %tmp1, float %tmp1, i1 true, i1 false) #0
+ ret void
+}
+
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+
+attributes #0 = { nounwind }
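
What the imm.ll checks above pin down is the hardware's inline-constant set for 32- and 64-bit operands: 0.0, +/-0.5, +/-1.0, +/-2.0, +/-4.0, the integers -16..64, and (on VI and later) 1/(2*pi) encode directly into the instruction, while anything else, such as 1024.0 (0x44800000) or 4096.0, costs an extra 32-bit literal dword. A minimal sketch of the distinction, separate from the tests themselves:

define amdgpu_kernel void @inline_vs_literal(float addrspace(1)* %out, float %x) {
  %a = fadd float %x, 2.0    ; 2.0 is an inline constant: no literal emitted
  %b = fadd float %a, 1024.0 ; 1024.0 (0x44800000) is not: encoded as a literal
  store float %b, float addrspace(1)* %out
  ret void
}
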
diff --git a/test/CodeGen/AMDGPU/imm16.ll b/test/CodeGen/AMDGPU/imm16.ll
index 2e73eb06502f..e42d58791890 100644
--- a/test/CodeGen/AMDGPU/imm16.ll
+++ b/test/CodeGen/AMDGPU/imm16.ll
@@ -7,7 +7,7 @@
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x8000{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) {
store volatile i16 -32768, i16 addrspace(1)* %out
ret void
}
@@ -15,7 +15,7 @@ define void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_0.0_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_0.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.0_f16(half addrspace(1)* %out) {
store half 0.0, half addrspace(1)* %out
ret void
}
@@ -24,7 +24,7 @@ define void @store_inline_imm_0.0_f16(half addrspace(1)* %out) {
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x8000{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_imm_neg_0.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_imm_neg_0.0_f16(half addrspace(1)* %out) {
store half -0.0, half addrspace(1)* %out
ret void
}
@@ -32,7 +32,7 @@ define void @store_imm_neg_0.0_f16(half addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_0.5_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3800{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_0.5_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.5_f16(half addrspace(1)* %out) {
store half 0.5, half addrspace(1)* %out
ret void
}
@@ -41,7 +41,7 @@ define void @store_inline_imm_0.5_f16(half addrspace(1)* %out) {
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffb800{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) {
store half -0.5, half addrspace(1)* %out
ret void
}
@@ -49,7 +49,7 @@ define void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_1.0_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_1.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_1.0_f16(half addrspace(1)* %out) {
store half 1.0, half addrspace(1)* %out
ret void
}
@@ -58,7 +58,7 @@ define void @store_inline_imm_1.0_f16(half addrspace(1)* %out) {
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) {
store half -1.0, half addrspace(1)* %out
ret void
}
@@ -66,7 +66,7 @@ define void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_2.0_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_2.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_2.0_f16(half addrspace(1)* %out) {
store half 2.0, half addrspace(1)* %out
ret void
}
@@ -75,7 +75,7 @@ define void @store_inline_imm_2.0_f16(half addrspace(1)* %out) {
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffc000{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) {
store half -2.0, half addrspace(1)* %out
ret void
}
@@ -83,7 +83,7 @@ define void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_4.0_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4400{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_4.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_4.0_f16(half addrspace(1)* %out) {
store half 4.0, half addrspace(1)* %out
ret void
}
@@ -92,7 +92,7 @@ define void @store_inline_imm_4.0_f16(half addrspace(1)* %out) {
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffc400{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) {
store half -4.0, half addrspace(1)* %out
ret void
}
@@ -101,7 +101,7 @@ define void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3118{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) {
store half 0xH3118, half addrspace(1)* %out
ret void
}
@@ -110,7 +110,7 @@ define void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) {
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffb118{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) {
store half 0xHB118, half addrspace(1)* %out
ret void
}
@@ -118,7 +118,7 @@ define void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_literal_imm_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c00
; GCN: buffer_store_short [[REG]]
-define void @store_literal_imm_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_f16(half addrspace(1)* %out) {
store half 4096.0, half addrspace(1)* %out
ret void
}
@@ -127,7 +127,7 @@ define void @store_literal_imm_f16(half addrspace(1)* %out) {
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0, [[VAL]]{{$}}
; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 0.0
store half %y, half addrspace(1)* %out
ret void
@@ -137,7 +137,7 @@ define void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) {
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}}
; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 0.5
store half %y, half addrspace(1)* %out
ret void
@@ -147,7 +147,7 @@ define void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) {
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; VI: v_add_f16_e32 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}}
; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, -0.5
store half %y, half addrspace(1)* %out
ret void
@@ -157,7 +157,7 @@ define void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) {
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}}
; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 1.0
store half %y, half addrspace(1)* %out
ret void
@@ -167,7 +167,7 @@ define void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) {
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}}
; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, -1.0
store half %y, half addrspace(1)* %out
ret void
@@ -177,7 +177,7 @@ define void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) {
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}}
; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 2.0
store half %y, half addrspace(1)* %out
ret void
@@ -187,7 +187,7 @@ define void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) {
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}}
; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, -2.0
store half %y, half addrspace(1)* %out
ret void
@@ -197,7 +197,7 @@ define void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) {
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}}
; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 4.0
store half %y, half addrspace(1)* %out
ret void
@@ -207,7 +207,7 @@ define void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) {
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; VI: v_add_f16_e32 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}}
; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, -4.0
store half %y, half addrspace(1)* %out
ret void
@@ -217,7 +217,7 @@ define void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) {
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]
; VI: buffer_store_short [[REG]]
-define void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
%x = load half, half addrspace(1)* %in
%y = fadd half %x, 0.5
store half %y, half addrspace(1)* %out
@@ -228,7 +228,7 @@ define void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrsp
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0x6400, [[VAL]]
; VI: buffer_store_short [[REG]]
-define void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
%x = load half, half addrspace(1)* %in
%y = fadd half %x, 1024.0
store half %y, half addrspace(1)* %out
@@ -239,7 +239,7 @@ define void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)*
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1, [[VAL]]{{$}}
; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 0xH0001
store half %y, half addrspace(1)* %out
ret void
@@ -249,7 +249,7 @@ define void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) {
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2, [[VAL]]{{$}}
; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 0xH0002
store half %y, half addrspace(1)* %out
ret void
@@ -259,7 +259,7 @@ define void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) {
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 16, [[VAL]]{{$}}
; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 0xH0010
store half %y, half addrspace(1)* %out
ret void
@@ -269,7 +269,7 @@ define void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) {
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1, [[VAL]]{{$}}
; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 0xHFFFF
store half %y, half addrspace(1)* %out
ret void
@@ -279,7 +279,7 @@ define void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) {
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2, [[VAL]]{{$}}
; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 0xHFFFE
store half %y, half addrspace(1)* %out
ret void
@@ -289,7 +289,7 @@ define void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) {
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; VI: v_add_f16_e32 [[REG:v[0-9]+]], -16, [[VAL]]{{$}}
; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 0xHFFF0
store half %y, half addrspace(1)* %out
ret void
@@ -299,7 +299,7 @@ define void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) {
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 63, [[VAL]]
; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 0xH003F
store half %y, half addrspace(1)* %out
ret void
@@ -309,7 +309,7 @@ define void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) {
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 64, [[VAL]]
; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 0xH0040
store half %y, half addrspace(1)* %out
ret void
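
imm16.ll above and the new immv216.ll below replay the same inline-vs-literal distinction at 16 bits, where the VI checks additionally show the 16-bit constant sign-extended in the 32-bit move (0xffff8000 for -0.0, versus 0x8000 on SI). A small f16 sketch of the two cases, mirroring the checks above rather than quoting any single test:

define amdgpu_kernel void @f16_inline_vs_literal(half addrspace(1)* %out, half %x) {
  %a = fadd half %x, 1.0    ; inline: prints as v_add_f16 ..., 1.0, ...
  %b = fadd half %a, 1024.0 ; 0xH6400: outside the inline set, needs a literal
  store half %b, half addrspace(1)* %out
  ret void
}
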
diff --git a/test/CodeGen/AMDGPU/immv216.ll b/test/CodeGen/AMDGPU/immv216.ll
new file mode 100644
index 000000000000..85ad365d02a8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/immv216.ll
@@ -0,0 +1,446 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; FIXME: Merge into imm.ll
+
+; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_neg_0.0_v2i16(<2 x i16> addrspace(1)* %out) #0 {
+ store <2 x i16> <i16 -32768, i16 -32768>, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_0.0_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+ store <2 x half> <half 0.0, half 0.0>, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_imm_neg_0.0_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_imm_neg_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+ store <2 x half> <half -0.0, half -0.0>, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_0.5_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x38003800{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
+ store <2 x half> <half 0.5, half 0.5>, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_m_0.5_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800b800{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_m_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
+ store <2 x half> <half -0.5, half -0.5>, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_1.0_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+ store <2 x half> <half 1.0, half 1.0>, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_m_1.0_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_m_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+ store <2 x half> <half -1.0, half -1.0>, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_2.0_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x40004000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+ store <2 x half> <half 2.0, half 2.0>, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_m_2.0_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000c000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_m_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+ store <2 x half> <half -2.0, half -2.0>, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_4.0_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x44004400{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+ store <2 x half> <half 4.0, half 4.0>, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_m_4.0_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400c400{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_m_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+ store <2 x half> <half -4.0, half -4.0>, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x31183118{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
+ store <2 x half> <half 0xH3118, half 0xH3118>, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118b118{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_m_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
+ store <2 x half> <half 0xHB118, half 0xHB118>, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_literal_imm_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c006c00
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out) #0 {
+ store <2 x half> <half 4096.0, half 4096.0>, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_0.0_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+ %y = fadd <2 x half> %x, <half 0.0, half 0.0>
+ store <2 x half> %y, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_0.5_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+ %y = fadd <2 x half> %x, <half 0.5, half 0.5>
+ store <2 x half> %y, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+ %y = fadd <2 x half> %x, <half -0.5, half -0.5>
+ store <2 x half> %y, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_1.0_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1.0{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+ %y = fadd <2 x half> %x, <half 1.0, half 1.0>
+ store <2 x half> %y, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1.0{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+ %y = fadd <2 x half> %x, <half -1.0, half -1.0>
+ store <2 x half> %y, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_2.0_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2.0{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+ %y = fadd <2 x half> %x, <half 2.0, half 2.0>
+ store <2 x half> %y, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2.0{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+ %y = fadd <2 x half> %x, <half -2.0, half -2.0>
+ store <2 x half> %y, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_4.0_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 4.0{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+ %y = fadd <2 x half> %x, <half 4.0, half 4.0>
+ store <2 x half> %y, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -4.0{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+ %y = fadd <2 x half> %x, <half -4.0, half -4.0>
+ store <2 x half> %y, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_add_inline_imm_0.5_v2f16:
+; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_dword
+; VI-NOT: and
+; VI: v_lshrrev_b32_e32 {{v[0-9]+}}, 16,
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+ %x = load <2 x half>, <2 x half> addrspace(1)* %in
+ %y = fadd <2 x half> %x, <half 0.5, half 0.5>
+ store <2 x half> %y, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_add_literal_v2f16:
+; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
+; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x64006400
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[K]], [[VAL]]
+; GFX9: buffer_store_dword [[REG]]
+
+; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
+; VI-DAG: buffer_load_dword
+; VI-NOT: and
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: buffer_store_dword
+define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+ %x = load <2 x half>, <2 x half> addrspace(1)* %in
+ %y = fadd <2 x half> %x, <half 1024.0, half 1024.0>
+ store <2 x half> %y, <2 x half> addrspace(1)* %out
+ ret void
+}
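
A note on the constant being checked here: half-precision 1024.0 is 2^10, and it is not one of the f16 values the hardware can encode as an inline operand (0.0, +/-0.5, +/-1.0, +/-2.0, +/-4.0 and the small integers), so it has to be materialized as a literal. The worked encoding, which is where the 0x64006400 in the GFX9 check comes from:

    1024.0 = 2^10
    f16 bits: sign 0 | exponent 10+15 = 25 = 0b11001 | mantissa 0  ->  0x6400
    <2 x half> splat of 0x6400                                     ->  0x64006400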
+
+; GCN-LABEL: {{^}}add_inline_imm_1_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+ %y = fadd <2 x half> %x, <half 0xH0001, half 0xH0001>
+ store <2 x half> %y, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_2_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+ %y = fadd <2 x half> %x, <half 0xH0002, half 0xH0002>
+ store <2 x half> %y, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_16_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 16{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+ %y = fadd <2 x half> %x, <half 0xH0010, half 0xH0010>
+ store <2 x half> %y, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_neg_1_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+ %y = fadd <2 x half> %x, <half 0xHFFFF, half 0xHFFFF>
+ store <2 x half> %y, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_neg_2_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+ %y = fadd <2 x half> %x, <half 0xHFFFE, half 0xHFFFE>
+ store <2 x half> %y, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_neg_16_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -16{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -16, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -16, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+ %y = fadd <2 x half> %x, <half 0xHFFF0, half 0xHFFF0>
+ store <2 x half> %y, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_63_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 63
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+ %y = fadd <2 x half> %x, <half 0xH003F, half 0xH003F>
+ store <2 x half> %y, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_64_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 64
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+ %y = fadd <2 x half> %x, <half 0xH0040, half 0xH0040>
+ store <2 x half> %y, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
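
The tests from add_inline_imm_1_v2f16 down to add_inline_imm_64_v2f16 use raw 0xH bit patterns rather than round float values on purpose: the hardware also accepts the small integers -16..64 as inline operands, applied bit-for-bit to the 16-bit lanes. The mapping the VI check lines rely on, sketched:

    0xH0001 -> lane pattern 0x0001 -> inline integer 1
    0xHFFFF -> lane pattern 0xFFFF -> inline integer -1
    0xH0040 -> lane pattern 0x0040 -> inline integer 64 (largest positive inline value)

Anything outside that range falls back to a literal, as commute_add_literal_v2f16 above shows.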
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll b/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
index 877956be3088..8e207a38c847 100644
--- a/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
+++ b/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
@@ -10,7 +10,7 @@
; CHECK: s_mov_b32 m0, [[IN]]
; CHECK: v_movreld_b32_e32 v[[ELT0:[0-9]+]]
; CHECK-NEXT: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
-define void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
entry:
%ins = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
store <4 x float> %ins, <4 x float> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 208e55c143ac..b18ae353ca4c 100644
--- a/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -1,6 +1,7 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
; Tests for indirect addressing on SI, which is implemented using dynamic
; indexing of vectors.
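
The MOVREL and IDXMODE prefixes in these RUN lines select between the backend's two lowerings for a variable vector index: the m0-relative v_movrel instructions, or (on VI and later) a plain v_mov bracketed by s_set_gpr_idx_on/s_set_gpr_idx_off. A rough sketch of a dynamic vector read under each mode, with illustrative register names, matching the check lines below:

    ; MOVREL                          ; IDXMODE
    s_mov_b32 m0, s_idx               s_set_gpr_idx_on s_idx, src0
    v_movrels_b32_e32 v0, v_base      v_mov_b32_e32 v0, v_base
                                      s_set_gpr_idx_off

The newly added gfx900 RUN line checks that GFX9 takes the IDXMODE path by default.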
@@ -18,7 +19,7 @@
; IDXMODE: s_set_gpr_idx_on [[IN]], src0{{$}}
; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
; IDXMODE-NEXT: s_set_gpr_idx_off
-define void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
entry:
%idx = add i32 %in, 1
%elt = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %idx
@@ -43,7 +44,7 @@ entry:
; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, src0{{$}}
; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE-NEXT: s_set_gpr_idx_off
-define void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) {
+define amdgpu_kernel void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) {
entry:
%idx = add i32 %in, 1
%vec = or <4 x i32> %or.val, <i32 1, i32 2, i32 3, i32 4>
@@ -65,7 +66,7 @@ entry:
; IDXMODE: s_set_gpr_idx_on [[IN]], src0{{$}}
; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
; IDXMODE-NEXT: s_set_gpr_idx_off
-define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
entry:
%elt = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %in
store float %elt, float addrspace(1)* %out
@@ -83,7 +84,7 @@ entry:
; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE-NEXT: s_set_gpr_idx_off
-define void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) {
+define amdgpu_kernel void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) {
entry:
%index = add i32 %offset, -512
%value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
@@ -104,7 +105,7 @@ entry:
; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE-NEXT: s_set_gpr_idx_off
-define void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <4 x i32> %vec0, <4 x i32> %vec1, i32 %offset) {
+define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <4 x i32> %vec0, <4 x i32> %vec1, i32 %offset) {
entry:
%index = add i32 %offset, -512
%or = or <4 x i32> %vec0, %vec1
@@ -136,7 +137,7 @@ entry:
; IDXMODE: s_set_gpr_idx_off
; GCN: buffer_store_dword [[RESULT]]
-define void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
entry:
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
%index = add i32 %id, -512
@@ -146,7 +147,7 @@ entry:
}
; GCN-LABEL: {{^}}extract_undef_offset_sgpr:
-define void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
entry:
%ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
%value = extractelement <4 x i32> %ld, i32 undef
@@ -158,7 +159,7 @@ entry:
; GCN-DAG: buffer_load_dwordx4
; MOVREL-DAG: s_mov_b32 m0,
; MOVREL: v_movreld_b32
-define void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
entry:
%ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
%value = insertelement <4 x i32> %ld, i32 5, i32 undef
@@ -177,7 +178,7 @@ entry:
; MOVREL: v_movreld_b32_e32 v[[ELT1]], v[[INS]]
; MOVREL: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}}
-define void @insert_w_offset(<4 x float> addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @insert_w_offset(<4 x float> addrspace(1)* %out, i32 %in) {
entry:
%0 = add i32 %in, 1
%1 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %0
@@ -196,7 +197,7 @@ entry:
; IDXMODE-NEXT: s_set_gpr_idx_off
; GCN: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
-define void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
entry:
%0 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
store <4 x float> %0, <4 x float> addrspace(1)* %out
@@ -212,7 +213,7 @@ entry:
; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
; IDXMODE-NEXT: s_set_gpr_idx_off
-define void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) {
+define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) {
entry:
%index = add i32 %offset, -512
%value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
@@ -232,7 +233,7 @@ entry:
; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
; IDXMODE-NEXT: s_set_gpr_idx_off
-define void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) {
+define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) {
entry:
%index = add i32 %offset, -512
%value = insertelement <4 x i32> %vec, i32 5, i32 %index
@@ -269,7 +270,7 @@ entry:
; IDXMODE: s_set_gpr_idx_off
; GCN: buffer_store_dword
-define void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
entry:
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
%index = add i32 %id, -512
@@ -304,7 +305,7 @@ entry:
; GCN: s_cbranch_execnz
; IDXMODE: s_set_gpr_idx_off
-define void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
entry:
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
%index = add i32 %id, -16
@@ -374,7 +375,7 @@ entry:
; GCN: buffer_store_dword [[MOVREL0]]
; GCN: buffer_store_dword [[MOVREL1]]
-define void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
entry:
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
%id.ext = zext i32 %id to i64
@@ -449,7 +450,7 @@ bb2:
; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]:
; GCN: buffer_store_dword [[INS0]]
-define void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
+define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
entry:
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
%id.ext = zext i32 %id to i64
@@ -498,7 +499,7 @@ bb2:
; GCN: [[ENDBB]]:
; GCN: buffer_store_dword
; GCN: s_endpgm
-define void @extract_adjacent_blocks(i32 %arg) #0 {
+define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) #0 {
bb:
%tmp = icmp eq i32 %arg, 0
br i1 %tmp, label %bb1, label %bb4
@@ -548,7 +549,7 @@ bb7:
; GCN: [[ENDBB]]:
; GCN: buffer_store_dword
; GCN: s_endpgm
-define void @insert_adjacent_blocks(i32 %arg, float %val0) #0 {
+define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) #0 {
bb:
%tmp = icmp eq i32 %arg, 0
br i1 %tmp, label %bb1, label %bb4
@@ -609,7 +610,7 @@ bb7: ; preds = %bb4, %bb1
; GCN: ds_write_b32
; GCN: ds_write_b32
; GCN: s_endpgm
-define void @multi_same_block(i32 %arg) #0 {
+define amdgpu_kernel void @multi_same_block(i32 %arg) #0 {
bb:
%tmp1 = add i32 %arg, -16
%tmp2 = insertelement <6 x float> <float 1.700000e+01, float 1.800000e+01, float 1.900000e+01, float 2.000000e+01, float 2.100000e+01, float 2.200000e+01>, float 4.000000e+00, i32 %tmp1
@@ -636,7 +637,7 @@ bb:
; IDXMODE: s_set_gpr_idx_off
; GCN: buffer_store_dword [[EXTRACT]]
-define void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
entry:
%ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
%offset = add i32 %idx, 3
@@ -657,7 +658,7 @@ entry:
; IDXMODE: s_set_gpr_idx_off
; GCN: buffer_store_dword [[EXTRACT]]
-define void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
entry:
%ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
%offset = add i32 %idx, 4
@@ -680,7 +681,7 @@ entry:
; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], src0
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE: s_set_gpr_idx_off
-define void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) {
+define amdgpu_kernel void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) {
entry:
%ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
%idx.shl = shl i32 %idx.in, 2
@@ -701,7 +702,7 @@ entry:
; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], dst
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE: s_set_gpr_idx_off
-define void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind {
%idx.shl = shl i32 %idx.in, 2
%idx = or i32 %idx.shl, 1
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %idx
@@ -728,7 +729,7 @@ define void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x f
; IDXMODE: s_set_gpr_idx_idx
; IDXMODE: v_mov_b32_e32
; GCN: s_cbranch_execnz [[REGLOOP]]
-define void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
+define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
bb:
br label %bb2
diff --git a/test/CodeGen/AMDGPU/indirect-private-64.ll b/test/CodeGen/AMDGPU/indirect-private-64.ll
index 4db87c3c1b64..7f08a89d149e 100644
--- a/test/CodeGen/AMDGPU/indirect-private-64.ll
+++ b/test/CodeGen/AMDGPU/indirect-private-64.ll
@@ -20,10 +20,10 @@ declare void @llvm.amdgcn.s.barrier() #0
; SI-PROMOTE: ds_read_b64
; CI-PROMOTE: ds_write_b64
; CI-PROMOTE: ds_read_b64
-define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 {
+define amdgpu_kernel void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 {
%val = load double, double addrspace(1)* %in, align 8
- %array = alloca [16 x double], align 8
- %ptr = getelementptr inbounds [16 x double], [16 x double]* %array, i32 0, i32 %b
+ %array = alloca [8 x double], align 8
+ %ptr = getelementptr inbounds [8 x double], [8 x double]* %array, i32 0, i32 %b
store double %val, double* %ptr, align 8
call void @llvm.amdgcn.s.barrier()
%result = load double, double* %ptr, align 8
@@ -51,10 +51,10 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double
; SI-PROMOTE: ds_read_b64
; CI-PROMOTE: ds_write2_b64
; CI-PROMOTE: ds_read2_b64
-define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 {
+define amdgpu_kernel void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 {
%val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16
- %array = alloca [8 x <2 x double>], align 16
- %ptr = getelementptr inbounds [8 x <2 x double>], [8 x <2 x double>]* %array, i32 0, i32 %b
+ %array = alloca [4 x <2 x double>], align 16
+ %ptr = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* %array, i32 0, i32 %b
store <2 x double> %val, <2 x double>* %ptr, align 16
call void @llvm.amdgcn.s.barrier()
%result = load <2 x double>, <2 x double>* %ptr, align 16
@@ -77,7 +77,7 @@ define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out
; SI-PROMOTE: ds_read_b64
; CI-PROMOTE: ds_write_b64
; CI-PROMOTE: ds_read_b64
-define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) #1 {
+define amdgpu_kernel void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) #1 {
%val = load i64, i64 addrspace(1)* %in, align 8
%array = alloca [8 x i64], align 8
%ptr = getelementptr inbounds [8 x i64], [8 x i64]* %array, i32 0, i32 %b
@@ -109,10 +109,10 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs
; SI-PROMOTE: ds_read_b64
; CI-PROMOTE: ds_write2_b64
; CI-PROMOTE: ds_read2_b64
-define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 {
+define amdgpu_kernel void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 {
%val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
- %array = alloca [8 x <2 x i64>], align 16
- %ptr = getelementptr inbounds [8 x <2 x i64>], [8 x <2 x i64>]* %array, i32 0, i32 %b
+ %array = alloca [4 x <2 x i64>], align 16
+ %ptr = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* %array, i32 0, i32 %b
store <2 x i64> %val, <2 x i64>* %ptr, align 16
call void @llvm.amdgcn.s.barrier()
%result = load <2 x i64>, <2 x i64>* %ptr, align 16
@@ -121,4 +121,4 @@ define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <
}
attributes #0 = { convergent nounwind }
-attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="64,64" }
+attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="64,128" }
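
The alloca sizes and the attribute change at the bottom of this file move together: raising the maximum flat work-group size from 64 to 128 halves the per-work-item share of LDS available to the promote-alloca path (budget ~ LDS size / max work-group size), so the arrays are halved in turn ([16 x double] -> [8 x double], [8 x <2 x ...>] -> [4 x <2 x ...>]), presumably to keep them promotable and the ds_write/ds_read checks valid. The exact byte budget also depends on the amdgpu-waves-per-eu occupancy target, so take the proportionality as the intent rather than a precise formula.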
diff --git a/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll b/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll
index 990f33518ab9..7cee8a41c120 100644
--- a/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll
+++ b/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll
@@ -2,7 +2,7 @@
; REQUIRES: asserts
; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s
-define void @inf_loop_irreducible_cfg() nounwind {
+define amdgpu_kernel void @inf_loop_irreducible_cfg() nounwind {
entry:
br label %block
diff --git a/test/CodeGen/AMDGPU/infinite-loop.ll b/test/CodeGen/AMDGPU/infinite-loop.ll
index 3e0b695934c7..73482756b8c8 100644
--- a/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -7,7 +7,7 @@
; SI: buffer_store_dword [[REG]]
; SI: s_waitcnt vmcnt(0) expcnt(0)
; SI: s_branch BB0_1
-define void @infinite_loop(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) {
entry:
br label %for.body
diff --git a/test/CodeGen/AMDGPU/inline-asm.ll b/test/CodeGen/AMDGPU/inline-asm.ll
index db1a0c67436d..0d7e07b9a624 100644
--- a/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/test/CodeGen/AMDGPU/inline-asm.ll
@@ -4,7 +4,7 @@
; CHECK-LABEL: {{^}}inline_asm:
; CHECK: s_endpgm
; CHECK: s_endpgm
-define void @inline_asm(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @inline_asm(i32 addrspace(1)* %out) {
entry:
store i32 5, i32 addrspace(1)* %out
call void asm sideeffect "s_endpgm", ""()
@@ -25,7 +25,7 @@ entry:
; Make sure inline assembly is treated as divergent.
; CHECK: s_mov_b32 s{{[0-9]+}}, 0
; CHECK: s_and_saveexec_b64
-define void @branch_on_asm(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @branch_on_asm(i32 addrspace(1)* %out) {
%zero = call i32 asm "s_mov_b32 $0, 0", "=s"()
%cmp = icmp eq i32 %zero, 0
br i1 %cmp, label %if, label %endif
@@ -44,7 +44,7 @@ endif:
; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[MASK_LO]]
; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[MASK_HI]]
; CHECK: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @v_cmp_asm(i64 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @v_cmp_asm(i64 addrspace(1)* %out, i32 %in) {
%sgpr = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 %in)
store i64 %sgpr, i64 addrspace(1)* %out
ret void
@@ -52,7 +52,7 @@ define void @v_cmp_asm(i64 addrspace(1)* %out, i32 %in) {
; CHECK-LABEL: {{^}}code_size_inline_asm:
; CHECK: codeLenInByte = 12
-define void @code_size_inline_asm(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm(i32 addrspace(1)* %out) {
entry:
call void asm sideeffect "v_nop_e64", ""()
ret void
@@ -61,7 +61,7 @@ entry:
; All inlineasm instructions are assumed to be the maximum size
; CHECK-LABEL: {{^}}code_size_inline_asm_small_inst:
; CHECK: codeLenInByte = 12
-define void @code_size_inline_asm_small_inst(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_small_inst(i32 addrspace(1)* %out) {
entry:
call void asm sideeffect "v_nop_e32", ""()
ret void
@@ -69,7 +69,7 @@ entry:
; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst:
; CHECK: codeLenInByte = 20
-define void @code_size_inline_asm_2_inst(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_2_inst(i32 addrspace(1)* %out) {
entry:
call void asm sideeffect "
v_nop_e64
@@ -80,7 +80,7 @@ entry:
; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst_extra_newline:
; CHECK: codeLenInByte = 20
-define void @code_size_inline_asm_2_inst_extra_newline(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_2_inst_extra_newline(i32 addrspace(1)* %out) {
entry:
call void asm sideeffect "
v_nop_e64
@@ -92,7 +92,7 @@ entry:
; CHECK-LABEL: {{^}}code_size_inline_asm_0_inst:
; CHECK: codeLenInByte = 4
-define void @code_size_inline_asm_0_inst(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_0_inst(i32 addrspace(1)* %out) {
entry:
call void asm sideeffect "", ""()
ret void
@@ -100,7 +100,7 @@ entry:
; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment:
; CHECK: codeLenInByte = 4
-define void @code_size_inline_asm_1_comment(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_1_comment(i32 addrspace(1)* %out) {
entry:
call void asm sideeffect "; comment", ""()
ret void
@@ -108,7 +108,7 @@ entry:
; CHECK-LABEL: {{^}}code_size_inline_asm_newline_1_comment:
; CHECK: codeLenInByte = 4
-define void @code_size_inline_asm_newline_1_comment(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_newline_1_comment(i32 addrspace(1)* %out) {
entry:
call void asm sideeffect "
; comment", ""()
@@ -117,7 +117,7 @@ entry:
; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment_newline:
; CHECK: codeLenInByte = 4
-define void @code_size_inline_asm_1_comment_newline(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_1_comment_newline(i32 addrspace(1)* %out) {
entry:
call void asm sideeffect "; comment
", ""()
@@ -126,7 +126,7 @@ entry:
; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line:
; CHECK: codeLenInByte = 4
-define void @code_size_inline_asm_2_comments_line(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_2_comments_line(i32 addrspace(1)* %out) {
entry:
call void asm sideeffect "; first comment ; second comment", ""()
ret void
@@ -134,7 +134,7 @@ entry:
; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line_nospace:
; CHECK: codeLenInByte = 4
-define void @code_size_inline_asm_2_comments_line_nospace(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_2_comments_line_nospace(i32 addrspace(1)* %out) {
entry:
call void asm sideeffect "; first comment;second comment", ""()
ret void
@@ -142,7 +142,7 @@ entry:
; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments0:
; CHECK: codeLenInByte = 20
-define void @code_size_inline_asm_mixed_comments0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_mixed_comments0(i32 addrspace(1)* %out) {
entry:
call void asm sideeffect "; comment
v_nop_e64 ; inline comment
@@ -157,7 +157,7 @@ entry:
; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments1:
; CHECK: codeLenInByte = 20
-define void @code_size_inline_asm_mixed_comments1(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_mixed_comments1(i32 addrspace(1)* %out) {
entry:
call void asm sideeffect "v_nop_e64 ; inline comment
; separate comment
@@ -171,7 +171,7 @@ entry:
; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments_operands:
; CHECK: codeLenInByte = 20
-define void @code_size_inline_asm_mixed_comments_operands(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_mixed_comments_operands(i32 addrspace(1)* %out) {
entry:
call void asm sideeffect "; comment
v_add_i32_e32 v0, vcc, v1, v2 ; inline comment
@@ -183,3 +183,52 @@ entry:
", ""()
ret void
}
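
The codeLenInByte values in these tests are consistent with a simple accounting: 4 bytes for the final s_endpgm plus 8 bytes charged per inline-asm instruction (the assumed maximum size), with comments, blank lines, and newlines costing nothing:

    0 instructions -> 4 + 8*0 = 4
    1 instruction  -> 4 + 8*1 = 12
    2 instructions -> 4 + 8*2 = 20

The mixed-comment variants pin down that only real instructions are counted, regardless of how the comments are interleaved.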
+
+; FIXME: Should not have intermediate sgprs
+; CHECK-LABEL: {{^}}i64_imm_input_phys_vgpr:
+; CHECK: s_mov_b32 s1, 0
+; CHECK: s_mov_b32 s0, 0x1e240
+; CHECK: v_mov_b32_e32 v0, s0
+; CHECK: v_mov_b32_e32 v1, s1
+; CHECK: use v[0:1]
+define void @i64_imm_input_phys_vgpr() {
+entry:
+ call void asm sideeffect "; use $0 ", "{VGPR0_VGPR1}"(i64 123456)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}i1_imm_input_phys_vgpr:
+; CHECK: v_mov_b32_e32 v0, -1{{$}}
+; CHECK: ; use v0
+define amdgpu_kernel void @i1_imm_input_phys_vgpr() {
+entry:
+ call void asm sideeffect "; use $0 ", "{VGPR0}"(i1 true)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}i1_input_phys_vgpr:
+; CHECK: {{buffer|flat}}_load_ubyte [[LOAD:v[0-9]+]]
+; CHECK: v_and_b32_e32 [[LOAD]], 1, [[LOAD]]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, [[LOAD]]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK: ; use v0
+define amdgpu_kernel void @i1_input_phys_vgpr() {
+entry:
+ %val = load i1, i1 addrspace(1)* undef
+ call void asm sideeffect "; use $0 ", "{VGPR0}"(i1 %val)
+ ret void
+}
+
+; FIXME: Should be scheduled to shrink vcc
+; CHECK-LABEL: {{^}}i1_input_phys_vgpr_x2:
+; CHECK: v_cmp_eq_u32_e32 vcc, 1, v0
+; CHECK: v_cmp_eq_u32_e64 s[0:1], 1, v1
+; CHECK: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+define amdgpu_kernel void @i1_input_phys_vgpr_x2() {
+entry:
+ %val0 = load volatile i1, i1 addrspace(1)* undef
+ %val1 = load volatile i1, i1 addrspace(1)* undef
+ call void asm sideeffect "; use $0 $1 ", "{VGPR0}, {VGPR1}"(i1 %val0, i1 %val1)
+ ret void
+}
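
The i1 tests added here document how a 1-bit value reaches a physical VGPR: true becomes the all-ones mask -1 rather than 1, and a loaded i1 is normalized through compare-plus-select. Reading the checks as a sketch:

    buffer_load_ubyte vL          ; i1 loaded as a byte
    v_and_b32 vL, 1, vL           ; keep the low bit
    v_cmp_eq_u32 vcc, 1, vL       ; rebuild a condition mask
    v_cndmask_b32 v0, 0, -1, vcc  ; select 0 / -1 into the asm's VGPR

The two FIXMEs flag the remaining inefficiencies: the immediate case routing through SGPRs, and the x2 case not reusing vcc for the second compare.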
diff --git a/test/CodeGen/AMDGPU/inline-calls.ll b/test/CodeGen/AMDGPU/inline-calls.ll
index 4541a902c1b8..f8821f319893 100644
--- a/test/CodeGen/AMDGPU/inline-calls.ll
+++ b/test/CodeGen/AMDGPU/inline-calls.ll
@@ -11,7 +11,7 @@ entry:
; CHECK: {{^}}kernel:
; CHECK-NOT: call
-define void @kernel(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @kernel(i32 addrspace(1)* %out) {
entry:
%tmp0 = call i32 @func(i32 1)
store i32 %tmp0, i32 addrspace(1)* %out
@@ -20,7 +20,7 @@ entry:
; CHECK: {{^}}kernel2:
; CHECK-NOT: call
-define void @kernel2(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @kernel2(i32 addrspace(1)* %out) {
entry:
call void @kernel(i32 addrspace(1)* %out)
ret void
@@ -31,7 +31,7 @@ entry:
; CHECK: {{^}}kernel3:
; CHECK-NOT: call
-define void @kernel3(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @kernel3(i32 addrspace(1)* %out) {
entry:
%tmp0 = call i32 @func_alias(i32 1)
store i32 %tmp0, i32 addrspace(1)* %out
@@ -43,7 +43,7 @@ entry:
; CHECK: {{^}}kernel4:
; CHECK-NOT: call
-define void @kernel4(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @kernel4(i32 addrspace(1)* %out) {
entry:
call void @kernel_alias(i32 addrspace(1)* %out)
ret void
diff --git a/test/CodeGen/AMDGPU/inline-constraints.ll b/test/CodeGen/AMDGPU/inline-constraints.ll
index 1bcbd14009ce..941a1b90dcc1 100644
--- a/test/CodeGen/AMDGPU/inline-constraints.ll
+++ b/test/CodeGen/AMDGPU/inline-constraints.ll
@@ -10,7 +10,7 @@
; GCN: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
; GCN: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
-define void @inline_reg_constraints(i32 addrspace(1)* %ptr) {
+define amdgpu_kernel void @inline_reg_constraints(i32 addrspace(1)* %ptr) {
entry:
%v32 = tail call i32 asm sideeffect "flat_load_dword $0, $1", "=v,v"(i32 addrspace(1)* %ptr)
%v64 = tail call <2 x i32> asm sideeffect "flat_load_dwordx2 $0, $1", "=v,v"(i32 addrspace(1)* %ptr)
@@ -27,7 +27,7 @@ entry:
; GCN: s_mov_b32 m0, -1
; GCN: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
; GCN: ; use [[COPY_M0]]
-define void @inline_sreg_constraint_m0() {
+define amdgpu_kernel void @inline_sreg_constraint_m0() {
%m0 = tail call i32 asm sideeffect "s_mov_b32 m0, -1", "={M0}"()
tail call void asm sideeffect "; use $0", "s"(i32 %m0)
ret void
@@ -36,7 +36,7 @@ define void @inline_sreg_constraint_m0() {
; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i32:
; GCN: s_mov_b32 [[REG:s[0-9]+]], 32
; GCN: ; use [[REG]]
-define void @inline_sreg_constraint_imm_i32() {
+define amdgpu_kernel void @inline_sreg_constraint_imm_i32() {
tail call void asm sideeffect "; use $0", "s"(i32 32)
ret void
}
@@ -44,7 +44,7 @@ define void @inline_sreg_constraint_imm_i32() {
; GCN-LABEL: {{^}}inline_sreg_constraint_imm_f32:
; GCN: s_mov_b32 [[REG:s[0-9]+]], 1.0
; GCN: ; use [[REG]]
-define void @inline_sreg_constraint_imm_f32() {
+define amdgpu_kernel void @inline_sreg_constraint_imm_f32() {
tail call void asm sideeffect "; use $0", "s"(float 1.0)
ret void
}
@@ -54,7 +54,7 @@ define void @inline_sreg_constraint_imm_f32() {
; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], -4{{$}}
; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], -1{{$}}
; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
-define void @inline_sreg_constraint_imm_i64() {
+define amdgpu_kernel void @inline_sreg_constraint_imm_i64() {
tail call void asm sideeffect "; use $0", "s"(i64 -4)
ret void
}
@@ -63,7 +63,7 @@ define void @inline_sreg_constraint_imm_i64() {
; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 0{{$}}
; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 0x3ff00000{{$}}
; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
-define void @inline_sreg_constraint_imm_f64() {
+define amdgpu_kernel void @inline_sreg_constraint_imm_f64() {
tail call void asm sideeffect "; use $0", "s"(double 1.0)
ret void
}
diff --git a/test/CodeGen/AMDGPU/inlineasm-16.ll b/test/CodeGen/AMDGPU/inlineasm-16.ll
index 75f3158937dc..15e57fe6bffb 100644
--- a/test/CodeGen/AMDGPU/inlineasm-16.ll
+++ b/test/CodeGen/AMDGPU/inlineasm-16.ll
@@ -5,7 +5,7 @@
; GCN-LABEL: {{^}}s_input_output_i16:
; SICI: error: couldn't allocate output register for constraint 's'
; SICI: error: couldn't allocate input reg for constraint 's'
-define void @s_input_output_i16() #0 {
+define amdgpu_kernel void @s_input_output_i16() #0 {
%v = tail call i16 asm sideeffect "s_mov_b32 $0, -1", "=s"()
tail call void asm sideeffect "; use $0", "s"(i16 %v) #0
ret void
@@ -14,7 +14,7 @@ define void @s_input_output_i16() #0 {
; GCN-LABEL: {{^}}v_input_output_i16:
; SICI: error: couldn't allocate output register for constraint 'v'
; SICI: error: couldn't allocate input reg for constraint 'v'
-define void @v_input_output_i16() #0 {
+define amdgpu_kernel void @v_input_output_i16() #0 {
%v = tail call i16 asm sideeffect "v_mov_b32 $0, -1", "=v"() #0
tail call void asm sideeffect "; use $0", "v"(i16 %v)
ret void
@@ -23,7 +23,7 @@ define void @v_input_output_i16() #0 {
; GCN-LABEL: {{^}}s_input_output_f16:
; SICI: error: couldn't allocate output register for constraint 's'
; SICI: error: couldn't allocate input reg for constraint 's'
-define void @s_input_output_f16() #0 {
+define amdgpu_kernel void @s_input_output_f16() #0 {
%v = tail call half asm sideeffect "s_mov_b32 $0, -1", "=s"() #0
tail call void asm sideeffect "; use $0", "s"(half %v)
ret void
@@ -32,7 +32,7 @@ define void @s_input_output_f16() #0 {
; GCN-LABEL: {{^}}v_input_output_f16:
; SICI: error: couldn't allocate output register for constraint 'v'
; SICI: error: couldn't allocate input reg for constraint 'v'
-define void @v_input_output_f16() #0 {
+define amdgpu_kernel void @v_input_output_f16() #0 {
%v = tail call half asm sideeffect "v_mov_b32 $0, -1", "=v"() #0
tail call void asm sideeffect "; use $0", "v"(half %v)
ret void
diff --git a/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll b/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll
index 2eb21f07e0ec..c1d67ba614c6 100644
--- a/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll
+++ b/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll
@@ -3,7 +3,7 @@
; GCN: error: couldn't allocate output register for constraint 's'
; GCN: error: couldn't allocate input reg for constraint 's'
-define void @s_input_output_i8() {
+define amdgpu_kernel void @s_input_output_i8() {
%v = tail call i8 asm sideeffect "s_mov_b32 $0, -1", "=s"()
tail call void asm sideeffect "; use $0", "s"(i8 %v)
ret void
@@ -11,7 +11,7 @@ define void @s_input_output_i8() {
; GCN: error: couldn't allocate output register for constraint 'v'
; GCN: error: couldn't allocate input reg for constraint 'v'
-define void @v_input_output_i8() {
+define amdgpu_kernel void @v_input_output_i8() {
%v = tail call i8 asm sideeffect "v_mov_b32 $0, -1", "=v"()
tail call void asm sideeffect "; use $0", "v"(i8 %v)
ret void
@@ -19,7 +19,7 @@ define void @v_input_output_i8() {
; GCN: error: couldn't allocate output register for constraint 's'
; GCN: error: couldn't allocate input reg for constraint 's'
-define void @s_input_output_i128() {
+define amdgpu_kernel void @s_input_output_i128() {
%v = tail call i128 asm sideeffect "s_mov_b32 $0, -1", "=s"()
tail call void asm sideeffect "; use $0", "s"(i128 %v)
ret void
@@ -27,7 +27,7 @@ define void @s_input_output_i128() {
; GCN: error: couldn't allocate output register for constraint 's'
; GCN: error: couldn't allocate input reg for constraint 's'
-define void @s_input_output_v8f16() {
+define amdgpu_kernel void @s_input_output_v8f16() {
%v = tail call <8 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"()
tail call void asm sideeffect "; use $0", "s"(<8 x half> %v)
ret void
@@ -36,7 +36,7 @@ define void @s_input_output_v8f16() {
; CI: error: couldn't allocate output register for constraint 's'
; CI: error: couldn't allocate input reg for constraint 's'
; VI-NOT: error
-define void @s_input_output_f16() {
+define amdgpu_kernel void @s_input_output_f16() {
%v = tail call half asm sideeffect "s_mov_b32 $0, -1", "=s"()
tail call void asm sideeffect "; use $0", "s"(half %v)
ret void
@@ -44,7 +44,7 @@ define void @s_input_output_f16() {
; GCN: error: couldn't allocate output register for constraint 's'
; GCN: error: couldn't allocate input reg for constraint 's'
-define void @s_input_output_v2f16() {
+define amdgpu_kernel void @s_input_output_v2f16() {
%v = tail call <2 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"()
tail call void asm sideeffect "; use $0", "s"(<2 x half> %v)
ret void
@@ -52,7 +52,7 @@ define void @s_input_output_v2f16() {
; GCN: error: couldn't allocate output register for constraint 'v'
; GCN: error: couldn't allocate input reg for constraint 'v'
-define void @v_input_output_v2f16() {
+define amdgpu_kernel void @v_input_output_v2f16() {
%v = tail call <2 x half> asm sideeffect "v_mov_b32 $0, -1", "=v"()
tail call void asm sideeffect "; use $0", "v"(<2 x half> %v)
ret void
@@ -61,7 +61,7 @@ define void @v_input_output_v2f16() {
; CI: error: couldn't allocate output register for constraint 's'
; CI: error: couldn't allocate input reg for constraint 's'
; VI-NOT: error
-define void @s_input_output_i16() {
+define amdgpu_kernel void @s_input_output_i16() {
%v = tail call i16 asm sideeffect "s_mov_b32 $0, -1", "=s"()
tail call void asm sideeffect "; use $0", "s"(i16 %v)
ret void
@@ -69,14 +69,14 @@ define void @s_input_output_i16() {
; GCN: error: couldn't allocate output register for constraint 's'
; GCN: error: couldn't allocate input reg for constraint 's'
-define void @s_input_output_v2i16() {
+define amdgpu_kernel void @s_input_output_v2i16() {
%v = tail call <2 x i16> asm sideeffect "s_mov_b32 $0, -1", "=s"()
tail call void asm sideeffect "; use $0", "s"(<2 x i16> %v)
ret void
}
; FIXME: Crash in codegen prepare
-; define void @s_input_output_i3() {
+; define amdgpu_kernel void @s_input_output_i3() {
; %v = tail call i3 asm sideeffect "s_mov_b32 $0, -1", "=s"()
; tail call void asm sideeffect "; use $0", "s"(i3 %v)
; ret void
diff --git a/test/CodeGen/AMDGPU/inlineasm-packed.ll b/test/CodeGen/AMDGPU/inlineasm-packed.ll
new file mode 100644
index 000000000000..3c6c7e1d1b42
--- /dev/null
+++ b/test/CodeGen/AMDGPU/inlineasm-packed.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}inline_asm_input_v2i16:
+; GCN: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @inline_asm_input_v2i16(i32 addrspace(1)* %out, <2 x i16> %in) #0 {
+entry:
+ %val = call i32 asm "s_mov_b32 $0, $1", "=r,r"(<2 x i16> %in) #0
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}inline_asm_input_v2f16:
+; GCN: s_mov_b32 s0, s{{[0-9]+}}
+define amdgpu_kernel void @inline_asm_input_v2f16(i32 addrspace(1)* %out, <2 x half> %in) #0 {
+entry:
+ %val = call i32 asm "s_mov_b32 $0, $1", "=r,r"(<2 x half> %in) #0
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}inline_asm_output_v2i16:
+; GCN: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @inline_asm_output_v2i16(<2 x i16> addrspace(1)* %out, i32 %in) #0 {
+entry:
+ %val = call <2 x i16> asm "s_mov_b32 $0, $1", "=r,r"(i32 %in) #0
+ store <2 x i16> %val, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}inline_asm_output_v2f16:
+; GCN: v_mov_b32 v{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @inline_asm_output_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
+entry:
+ %val = call <2 x half> asm "v_mov_b32 $0, $1", "=v,r"(i32 %in) #0
+ store <2 x half> %val, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}inline_asm_packed_v2i16:
+; GCN: v_pk_add_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @inline_asm_packed_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %in0, <2 x i16> %in1) #0 {
+entry:
+ %val = call <2 x i16> asm "v_pk_add_u16 $0, $1, $2", "=v,r,v"(<2 x i16> %in0, <2 x i16> %in1) #0
+ store <2 x i16> %val, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}inline_asm_packed_v2f16:
+; GCN: v_pk_add_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @inline_asm_packed_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in0, <2 x half> %in1) #0 {
+entry:
+ %val = call <2 x half> asm "v_pk_add_f16 $0, $1, $2", "=v,r,v"(<2 x half> %in0, <2 x half> %in1) #0
+ store <2 x half> %val, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
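
What the new file pins down, reading its own check lines: on this target the 'r' constraint resolves to a scalar register and 'v' to a vector register, and the packed 16-bit vector types are accepted directly as single 32-bit inline-asm operands on gfx9. The constraint-to-register mapping, sketched from the checks:

    "=r,r"   -> s_mov_b32  s, s
    "=v,r"   -> v_mov_b32  v, s
    "=v,r,v" -> v_pk_add_* v, s, v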
diff --git a/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir b/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
new file mode 100644
index 000000000000..bd5f296affb5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
@@ -0,0 +1,40 @@
+# RUN: llc -march=amdgcn -mcpu=polaris10 -run-pass si-insert-skips -amdgpu-skip-threshold=1 %s -o - | FileCheck %s
+# https://bugs.freedesktop.org/show_bug.cgi?id=99019
+--- |
+ define amdgpu_ps void @kill_uncond_branch() {
+ ret void
+ }
+...
+---
+
+# CHECK-LABEL: name: kill_uncond_branch
+
+# CHECK: bb.0:
+# CHECK: S_CBRANCH_VCCNZ %bb.1, implicit %vcc
+
+# CHECK: bb.1:
+# CHECK: V_CMPX_LE_F32_e32
+# CHECK-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit %exec
+
+# CHECK: bb.3:
+# CHECK-NEXT: EXP_DONE
+# CHECK: S_ENDPGM
+
+# CHECK: bb.2:
+# CHECK: S_ENDPGM
+
+name: kill_uncond_branch
+
+body: |
+ bb.0:
+ successors: %bb.1
+ S_CBRANCH_VCCNZ %bb.1, implicit %vcc
+
+ bb.1:
+ successors: %bb.2
+ %vgpr0 = V_MOV_B32_e32 0, implicit %exec
+ SI_KILL_TERMINATOR %vgpr0, implicit-def %exec, implicit-def %vcc, implicit %exec
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM
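
The CHECK block encodes the transformation under test: si-insert-skips must not leave the kill followed by a bare unconditional branch. The SI_KILL_TERMINATOR/S_BRANCH pair in bb.1 becomes a V_CMPX_LE_F32 that updates exec plus an S_CBRANCH_EXECNZ to bb.2 for the surviving lanes, and a new early-exit block (bb.3, with EXP_DONE and S_ENDPGM) is created for the case where every lane was killed. Roughly:

    bb.1 before: SI_KILL_TERMINATOR %vgpr0 ; S_BRANCH %bb.2
    bb.1 after:  V_CMPX_LE_F32_e32 ...     ; S_CBRANCH_EXECNZ %bb.2
    bb.3 (new):  EXP_DONE ... ; S_ENDPGM   ; all lanes killed

The linked freedesktop bug concerns exactly this kill-then-unconditional-branch shape.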
diff --git a/test/CodeGen/AMDGPU/insert-waits-callee.mir b/test/CodeGen/AMDGPU/insert-waits-callee.mir
new file mode 100644
index 000000000000..ad7cd0cc8abf
--- /dev/null
+++ b/test/CodeGen/AMDGPU/insert-waits-callee.mir
@@ -0,0 +1,25 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s
+--- |
+ define float @entry_callee_wait(float %arg) #0 {
+ ret float %arg
+ }
+
+ attributes #0 = { nounwind }
+...
+---
+# CHECK-LABEL: name: entry_callee_wait{{$}}
+# CHECK: bb.0:
+# CHECK-NEXT: S_WAITCNT 0{{$}}
+# CHECK-NEXT: V_ADD_F32
+# CHECK-NEXT: S_SETPC_B64
+liveins:
+ - { reg: '%sgpr0_sgpr1' }
+ - { reg: '%vgpr0' }
+
+name: entry_callee_wait
+body: |
+ bb.0:
+ %vgpr0 = V_ADD_F32_e32 %vgpr0, %vgpr0, implicit %exec
+ S_SETPC_B64 killed %sgpr0_sgpr1
+
+...
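
The point of this new callee test: a non-entry function cannot know which memory operations are still outstanding at its call sites, so si-insert-waits conservatively emits S_WAITCNT 0 at the top of the function, before the first use of the live-in %vgpr0. The CHECK-NEXT chain pins the ordering: the wait must come before the V_ADD_F32 and the S_SETPC_B64 return.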
diff --git a/test/CodeGen/AMDGPU/insert-waits-exp.mir b/test/CodeGen/AMDGPU/insert-waits-exp.mir
index 9aaa374ed28e..1055201ce3dd 100644
--- a/test/CodeGen/AMDGPU/insert-waits-exp.mir
+++ b/test/CodeGen/AMDGPU/insert-waits-exp.mir
@@ -1,18 +1,18 @@
# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s
--- |
- define amdgpu_ps <4 x float> @exp_done_waitcnt(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) {
+ define amdgpu_ps <4 x float> @exp_done_waitcnt(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) #0 {
%a = load volatile float, float addrspace(1)* undef
%b = load volatile float, float addrspace(1)* undef
%c = load volatile float, float addrspace(1)* undef
%d = load volatile float, float addrspace(1)* undef
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %a, float %b, float %c, float %d)
+ call void @llvm.amdgcn.exp.f32(i32 15, i32 1, float %a, float %b, float %c, float %d, i1 true, i1 false)
ret <4 x float> <float 5.000000e-01, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>
}
- declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+ declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
- attributes #0 = { readnone }
- attributes #1 = { nounwind }
+ attributes #0 = { nounwind }
...
---
@@ -58,6 +58,6 @@ body: |
%vgpr1 = V_MOV_B32_e32 1065353216, implicit %exec
%vgpr2 = V_MOV_B32_e32 1073741824, implicit %exec
%vgpr3 = V_MOV_B32_e32 1082130432, implicit %exec
- SI_RETURN killed %vgpr0, killed %vgpr1, killed %vgpr2, killed %vgpr3
+ SI_RETURN_TO_EPILOG killed %vgpr0, killed %vgpr1, killed %vgpr2, killed %vgpr3
...
diff --git a/test/CodeGen/AMDGPU/insert_subreg.ll b/test/CodeGen/AMDGPU/insert_subreg.ll
index 4a5e8869c2df..e895f27c886d 100644
--- a/test/CodeGen/AMDGPU/insert_subreg.ll
+++ b/test/CodeGen/AMDGPU/insert_subreg.ll
@@ -6,7 +6,7 @@
; Make sure this doesn't crash
; CHECK-LABEL: test:
-define void @test(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i64 addrspace(1)* %out) {
entry:
%tmp0 = alloca [16 x i32]
%tmp1 = ptrtoint [16 x i32]* %tmp0 to i32
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 65ac693a4f44..6391b6b5407b 100644
--- a/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=GCN-NO-TONGA %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=GCN-TONGA %s
; FIXME: Broken on evergreen
; FIXME: For some reason the 8 and 16 vectors are being stored as
@@ -18,56 +18,56 @@
; GCN-DAG: s_mov_b32 [[CONSTREG:s[0-9]+]], 0x40a00000
; GCN-DAG: v_mov_b32_e32 v[[LOW_REG:[0-9]+]], [[CONSTREG]]
; GCN: buffer_store_dwordx4 v{{\[}}[[LOW_REG]]:
-define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
ret void
}
; GCN-LABEL: {{^}}insertelement_v4f32_1:
-define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
ret void
}
; GCN-LABEL: {{^}}insertelement_v4f32_2:
-define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
ret void
}
; GCN-LABEL: {{^}}insertelement_v4f32_3:
-define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
ret void
}
; GCN-LABEL: {{^}}insertelement_v4i32_0:
-define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
%vecins = insertelement <4 x i32> %a, i32 999, i32 0
store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
ret void
}
; GCN-LABEL: {{^}}insertelement_v3f32_1:
-define void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
%vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
ret void
}
; GCN-LABEL: {{^}}insertelement_v3f32_2:
-define void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
%vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
ret void
}
; GCN-LABEL: {{^}}insertelement_v3f32_3:
-define void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
%vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
ret void
@@ -78,7 +78,7 @@ define void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %
define amdgpu_ps <4 x float> @insertelement_to_sgpr() nounwind {
%tmp = load <4 x i32>, <4 x i32> addrspace(2)* undef
%tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
- %tmp2 = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> %tmp1, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 false, i1 false, i1 false, i1 false, i1 true)
ret <4 x float> %tmp2
}
@@ -86,7 +86,7 @@ define amdgpu_ps <4 x float> @insertelement_to_sgpr() nounwind {
; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
-define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
%vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
ret void
@@ -97,7 +97,7 @@ define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x fl
; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
; GCN-DAG: buffer_store_dword v
-define void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
%vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
ret void
@@ -107,7 +107,7 @@ define void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x fl
; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
; GCN: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]:
-define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
ret void
@@ -117,7 +117,7 @@ define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x fl
; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
-define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
%vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
ret void
@@ -129,7 +129,7 @@ define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x fl
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
-define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
%vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
ret void
@@ -138,7 +138,7 @@ define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x
; GCN-LABEL: {{^}}dynamic_insertelement_v2i32:
; GCN: v_movreld_b32
; GCN: buffer_store_dwordx2
-define void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
%vecins = insertelement <2 x i32> %a, i32 5, i32 %b
store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
ret void
@@ -148,7 +148,7 @@ define void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32>
; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], 5
; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
; GCN-DAG: buffer_store_dword v
-define void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
%vecins = insertelement <3 x i32> %a, i32 5, i32 %b
store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16
ret void
@@ -159,7 +159,7 @@ define void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32>
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[VVAL]]
; GCN: buffer_store_dwordx4
-define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, i32 %val) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, i32 %val) nounwind {
%vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
ret void
@@ -169,7 +169,7 @@ define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32>
; GCN: v_movreld_b32
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
-define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
%vecins = insertelement <8 x i32> %a, i32 5, i32 %b
store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
ret void
@@ -181,21 +181,21 @@ define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32>
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
-define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
%vecins = insertelement <16 x i32> %a, i32 5, i32 %b
store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
ret void
}
; GCN-LABEL: {{^}}dynamic_insertelement_v2i16:
-define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
%vecins = insertelement <2 x i16> %a, i16 5, i32 %b
store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
ret void
}
; GCN-LABEL: {{^}}dynamic_insertelement_v3i16:
-define void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
%vecins = insertelement <3 x i16> %a, i16 5, i32 %b
store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8
ret void
@@ -207,25 +207,22 @@ define void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16>
; GCN: buffer_load_ushort v{{[0-9]+}}, off
; GCN: buffer_load_ushort v{{[0-9]+}}, off
-; GCN-DAG: v_mov_b32_e32 [[BASE_FI:v[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 [[BASE_FI:v[0-9]+]], 8{{$}}
; GCN-DAG: s_and_b32 [[MASK_IDX:s[0-9]+]], s{{[0-9]+}}, 3{{$}}
; GCN-DAG: v_or_b32_e32 [[IDX:v[0-9]+]], [[MASK_IDX]], [[BASE_FI]]{{$}}
-; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6
-; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4
-; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2
-; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
+; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:14
+; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12
+; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:10
+; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8
; GCN: buffer_store_short v{{[0-9]+}}, [[IDX]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
; GCN: s_waitcnt
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
+; GCN: buffer_load_dwordx2
; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off
-define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind {
%vecins = insertelement <4 x i16> %a, i16 5, i32 %b
store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 8
ret void
@@ -235,16 +232,17 @@ define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
; GCN: buffer_load_ubyte v{{[0-9]+}}, off
; GCN: buffer_load_ubyte v{{[0-9]+}}, off
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:1
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4
; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN-NO-TONGA: buffer_load_ubyte
+; GCN-NO-TONGA: buffer_load_ubyte
+; GCN-TONGA: buffer_load_ushort
; GCN: buffer_store_short v{{[0-9]+}}, off
-define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <2 x i8> %a, i8 5, i32 %b
store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
ret void
@@ -255,19 +253,19 @@ define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a
; GCN: buffer_load_ubyte v{{[0-9]+}}, off
; GCN: buffer_load_ubyte v{{[0-9]+}}, off
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:1
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6
-; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
-
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN-NO-TONGA: buffer_load_ubyte
+; GCN-NO-TONGA: buffer_load_ubyte
+; GCN-NO-TONGA: buffer_load_ubyte
+; GCN-TONGA: buffer_load_ushort
+; GCN-TONGA: buffer_load_ubyte
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off
-define void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <3 x i8> %a, i8 5, i32 %b
store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
ret void
@@ -279,34 +277,35 @@ define void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a
; GCN: buffer_load_ubyte v{{[0-9]+}}, off
; GCN: buffer_load_ubyte v{{[0-9]+}}, off
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:3
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:1
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:7
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4
; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN-NO-TONGA: buffer_load_ubyte
+; GCN-NO-TONGA: buffer_load_ubyte
+; GCN-NO-TONGA: buffer_load_ubyte
+; GCN-NO-TONGA: buffer_load_ubyte
+; GCN-TONGA: buffer_load_dword
; GCN: buffer_store_dword v{{[0-9]+}}, off
-define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <4 x i8> %a, i8 5, i32 %b
store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
ret void
}
; GCN-LABEL: {{^}}dynamic_insertelement_v8i8:
-define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <8 x i8> %a, i8 5, i32 %b
store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
ret void
}
; GCN-LABEL: {{^}}dynamic_insertelement_v16i8:
-define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <16 x i8> %a, i8 5, i32 %b
store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
ret void
@@ -315,7 +314,7 @@ define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8>
; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that
; the compiler doesn't crash.
; GCN-LABEL: {{^}}insert_split_bb:
-define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
+define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
entry:
%0 = insertelement <2 x i32> undef, i32 %a, i32 0
%1 = icmp eq i32 %a, 0
@@ -362,7 +361,7 @@ endif:
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
-define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
%vecins = insertelement <2 x double> %a, double 8.0, i32 %b
store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
ret void
@@ -375,14 +374,14 @@ define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x d
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
-define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
%vecins = insertelement <2 x i64> %a, i64 5, i32 %b
store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
ret void
}
; GCN-LABEL: {{^}}dynamic_insertelement_v3i64:
-define void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
%vecins = insertelement <3 x i64> %a, i64 5, i32 %b
store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
ret void
@@ -396,15 +395,15 @@ define void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64>
; Stack store
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}{{$}}
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:16{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}}
; Write element
; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; Stack reload
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:16{{$}}
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}}
; Store result
; GCN: buffer_store_dwordx4
@@ -412,7 +411,7 @@ define void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64>
; GCN: s_endpgm
; GCN: ScratchSize: 64
-define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
%vecins = insertelement <4 x double> %a, double 8.0, i32 %b
store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
ret void
@@ -421,17 +420,17 @@ define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x d
; GCN-LABEL: {{^}}dynamic_insertelement_v8f64:
; GCN-DAG: SCRATCH_RSRC_DWORD
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:16{{$}}
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}}
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}}
-; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:64{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:80{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:96{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:112{{$}}
; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}}
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}}
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:16{{$}}
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:64{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:80{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:96{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:112{{$}}
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
@@ -439,10 +438,13 @@ define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x d
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
; GCN: ScratchSize: 128
-define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 {
%vecins = insertelement <8 x double> %a, double 8.0, i32 %b
store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
ret void
}
-declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) nounwind readnone
+declare <4 x float> @llvm.amdgcn.image.gather4.lz.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
new file mode 100644
index 000000000000..a3f82b8a0117
--- /dev/null
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -0,0 +1,470 @@
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx901 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s
+
+; GCN-LABEL: {{^}}s_insertelement_v2i16_0:
+; GCN: s_load_dword [[VEC:s[0-9]+]]
+
+; CIVI: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
+; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], 0x3e7{{$}}
+
+; GFX9-NOT: lshr
+; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, 0x3e7, [[VEC]]
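+;
+; (s_pack_lh_b32_b16 d, s0, s1 packs d = (s0 & 0xffff) | (s1 & 0xffff0000),
+; so GFX9 inserts the constant low half in a single instruction where CIVI
+; needs the and/or pair above.)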
+define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
+ %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+ %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
+ store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reg:
+; GCN: s_load_dword [[ELT0:s[0-9]+]]
+; GCN: s_load_dword [[VEC:s[0-9]+]]
+
+; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
+; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
+; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
+
+; GFX9-NOT: [[ELT0]]
+; GFX9-NOT: [[VEC]]
+; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT0]], [[VEC]]
+define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
+ %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+ %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
+ store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_insertelement_v2i16_0_multi_use_hi_reg:
+; GCN: s_load_dword [[ELT0:s[0-9]+]]
+; GCN: s_load_dword [[VEC:s[0-9]+]]
+
+; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
+; CIVI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
+; CIVI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16
+; CIVI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
+; CIVI-DAG: ; use [[SHR]]
+
+; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
+; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
+; GFX9-DAG: ; use [[ELT1]]
+define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
+ %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+ %elt1 = extractelement <2 x i16> %vec, i32 1
+ %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
+ store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+ %use1 = zext i16 %elt1 to i32
+ call void asm sideeffect "; use $0", "s"(i32 %use1) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi:
+; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
+; GCN: s_load_dword [[VEC:s[0-9]+]]
+
+; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
+; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_ARG]], [[ELT1]]
+
+; GFX9-NOT: [[ELT0]]
+; GFX9-NOT: [[VEC]]
+; GFX9: s_pack_hh_b32_b16 s{{[0-9]+}}, [[ELT_ARG]], [[VEC]]
+define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
+ %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+ %elt.hi = lshr i32 %elt.arg, 16
+ %elt = trunc i32 %elt.hi to i16
+ %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
+ store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_multi_use_1:
+; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
+; GCN: s_load_dword [[VEC:s[0-9]+]],
+
+; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
+; CIVI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[ELT1]]
+
+; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[ELT_ARG]], 16
+; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT1]], [[VEC]]
+; GFX9: ; use [[ELT1]]
+define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
+ %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+ %elt.hi = lshr i32 %elt.arg, 16
+ %elt = trunc i32 %elt.hi to i16
+ %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
+ store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+ %use1 = zext i16 %elt to i32
+ call void asm sideeffect "; use $0", "s"(i32 %use1) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_both_multi_use_1:
+; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
+; GCN: s_load_dword [[VEC:s[0-9]+]],
+
+; CIVI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
+; CIVI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
+; CIVI-DAG: s_lshl_b32 [[VEC_HI:s[0-9]+]], [[SHR]], 16
+; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]]
+
+; GFX9-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
+; GFX9-DAG: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[VEC]], 16
+; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]]
+; GFX9: ; use [[ELT_HI]]
+; GFX9: ; use [[VEC_HI]]
+define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
+ %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+ %elt.hi = lshr i32 %elt.arg, 16
+ %elt = trunc i32 %elt.hi to i16
+ %vec.hi = extractelement <2 x i16> %vec, i32 1
+ %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
+ store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+ %use1 = zext i16 %elt to i32
+ %vec.hi.use1 = zext i16 %vec.hi to i32
+
+ call void asm sideeffect "; use $0", "s"(i32 %use1) #0
+ call void asm sideeffect "; use $0", "s"(i32 %vec.hi.use1) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_insertelement_v2i16_1:
+; GCN: s_load_dword [[VEC:s[0-9]+]]
+
+; GCN-NOT: s_lshr
+
+; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}}
+; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x3e70000
+
+; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x3e7
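+;
+; (s_pack_ll_b32_b16 d, s0, s1 packs d = (s0 & 0xffff) | (s1 << 16),
+; matching the CIVI and/or pair above with 0x3e7 << 16 = 0x3e70000.)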
+define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
+ %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+ %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
+ store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_insertelement_v2i16_1_reg:
+; GCN: s_load_dword [[ELT1:s[0-9]+]]
+; GCN: s_load_dword [[VEC:s[0-9]+]]
+
+; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}}
+; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
+
+; GCN-NOT: s_lshr
+; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1]]
+define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
+ %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+ %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
+ store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_insertelement_v2f16_0:
+; GCN: s_load_dword [[VEC:s[0-9]+]]
+; CIVI: s_and_b32 [[ELT1:s[0-9]+]], [[VEC:s[0-9]+]], 0xffff0000
+; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], 0x4500
+
+; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
+; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, 0x4500, [[ELT1]]
+define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
+ %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
+ %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
+ store <2 x half> %vecins, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_insertelement_v2f16_1:
+; GFX9: s_load_dword [[VEC:s[0-9]+]]
+; GCN-NOT: s_lshr
+
+; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}}
+; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x45000000
+
+; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x4500
+define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
+ %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
+ %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
+ store <2 x half> %vecins, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_insertelement_v2i16_0:
+; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]]
+; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
+; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e7, [[ELT1]]
+
+; GFX9-DAG: s_movk_i32 [[ELT0:s[0-9]+]], 0x3e7{{$}}
+; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}}
+; GFX9: v_bfi_b32 [[RES:v[0-9]+]], [[MASK]], [[ELT0]], [[VEC]]
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
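+;
+; (v_bfi_b32 d, s0, s1, s2 computes (s0 & s1) | (~s0 & s2), so with the
+; 0xffff mask the low 16 bits come from [[ELT0]] and the high 16 bits are
+; kept from [[VEC]].)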
+define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
+ store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_insertelement_v2i16_0_reghi:
+; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]]
+; GCN-DAG: s_load_dword [[ELT0:s[0-9]+]]
+
+; CIVI-DAG: s_lshr_b32 [[ELT0_SHIFT:s[0-9]+]], [[ELT0]], 16
+; CIVI-DAG: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
+; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], [[ELT0_SHIFT]], [[ELT1]]
+
+; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}}
+; GFX9-DAG: v_lshrrev_b32_e64 [[ELT0_SHIFT:v[0-9]+]], 16, [[ELT0]]
+; GFX9: v_and_or_b32 [[RES:v[0-9]+]], [[VEC]], [[MASK]], [[ELT0_SHIFT]]
+
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
+define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %elt.hi = lshr i32 %elt.arg, 16
+ %elt = trunc i32 %elt.hi to i16
+ %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
+ store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_insertelement_v2i16_0_inlineimm:
+; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]]
+
+; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
+; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 53, [[ELT1]]
+
+; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}}
+; GFX9: v_bfi_b32 [[RES:v[0-9]+]], [[MASK]], 53, [[VEC]]
+
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
+define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %vecins = insertelement <2 x i16> %vec, i16 53, i32 0
+ store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0
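+; e.g. with c0 = 0x3e7 and c1 = 16 the folded form is
+; v_or_b32 dst, 0x3e70000, vec (0x3e7 << 16 = 0x3e70000), which is the
+; pattern the CIVI check below already matches.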
+
+; GCN-LABEL: {{^}}v_insertelement_v2i16_1:
+; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]]
+; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[VEC]]
+
+; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
+; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
+; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]]
+
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
+define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
+ store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_insertelement_v2i16_1_inlineimm:
+; GCN: flat_load_dword [[VEC:v[0-9]+]]
+; GCN: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
+; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0xfff10000, [[ELT0]]
+; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], -15, 16, [[ELT0]]
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
+define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %vecins = insertelement <2 x i16> %vec, i16 -15, i32 1
+ store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_insertelement_v2f16_0:
+; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]]
+
+; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
+; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x4500, [[ELT1]]
+
+; GFX9-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0x4500{{$}}
+; GFX9-DAG: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VEC]]
+; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[ELT1]], 16, [[ELT0]]
+
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
+define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+ %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
+ %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
+ store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_insertelement_v2f16_0_inlineimm:
+; GCN: flat_load_dword [[VEC:v[0-9]+]]
+
+; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
+; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 53, [[ELT1]]
+
+; GFX9: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VEC]]
+; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[ELT1]], 16, 53
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
+define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+ %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
+ %vecins = insertelement <2 x half> %vec, half 0xH0035, i32 0
+ store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_insertelement_v2f16_1:
+; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]]
+; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[VEC]]
+
+; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4500
+; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
+; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]]
+
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
+define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+ %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
+ %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
+ store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_insertelement_v2f16_1_inlineimm:
+; GCN: flat_load_dword [[VEC:v[0-9]+]]
+; GCN: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
+; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x230000, [[ELT0]]
+; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], 35, 16, [[ELT0]]
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
+define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+ %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
+ %vecins = insertelement <2 x half> %vec, half 0xH0023, i32 1
+ store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
+ ret void
+}
+
+; FIXME: Enable for others when argument load not split
+; GCN-LABEL: {{^}}s_insertelement_v2i16_dynamic:
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
+; GCN: s_load_dword [[IDX:s[0-9]+]]
+; GCN: s_load_dword [[VEC:s[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 [[VVEC:v[0-9]+]], [[VEC]]
+; GCN-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 16
+; GCN-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
+; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VVEC]]
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
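+; Here the mask itself is computed at run time, so the same v_bfi_b32
+; selection works for either half without a scratch round trip.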
+define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(2)* %idx.ptr) #0 {
+ %idx = load volatile i32, i32 addrspace(2)* %idx.ptr
+ %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+ %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
+ store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_sgpr:
+; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]]
+; GCN-DAG: s_load_dword [[IDX:s[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
+; GCN-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 16
+; GCN-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
+; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]]
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %idx) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
+ store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
+; GCN: flat_load_dword [[IDX:v[0-9]+]]
+; GCN: flat_load_dword [[VEC:v[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
+
+; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
+; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
+
+; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
+; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]]
+
+; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]]
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %idx = load i32, i32 addrspace(1)* %idx.gep
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
+ store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
+; GCN: flat_load_dword [[IDX:v[0-9]+]]
+; GCN: flat_load_dword [[VEC:v[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
+
+; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
+; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
+
+; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
+; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]]
+
+; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]]
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
+ %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+ %idx = load i32, i32 addrspace(1)* %idx.gep
+ %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
+ %vecins = insertelement <2 x half> %vec, half 0xH1234, i32 %idx
+ store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/inserted-wait-states.mir b/test/CodeGen/AMDGPU/inserted-wait-states.mir
index 85cd903a405d..1479303712d0 100644
--- a/test/CodeGen/AMDGPU/inserted-wait-states.mir
+++ b/test/CodeGen/AMDGPU/inserted-wait-states.mir
@@ -1,14 +1,46 @@
# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN
# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI
# RUN: llc -march=amdgcn -mcpu=fiji -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI,GFX9
--- |
- define void @div_fmas() { ret void }
- define void @s_getreg() { ret void }
- define void @s_setreg() { ret void }
- define void @vmem_gt_8dw_store() { ret void }
- define void @readwrite_lane() { ret void }
- define void @rfe() { ret void }
+ define amdgpu_kernel void @div_fmas() { ret void }
+ define amdgpu_kernel void @s_getreg() { ret void }
+ define amdgpu_kernel void @s_setreg() { ret void }
+ define amdgpu_kernel void @vmem_gt_8dw_store() { ret void }
+ define amdgpu_kernel void @readwrite_lane() { ret void }
+ define amdgpu_kernel void @rfe() { ret void }
+ define amdgpu_kernel void @s_mov_fed_b32() { ret void }
+ define amdgpu_kernel void @s_movrel() { ret void }
+ define amdgpu_kernel void @v_interp() { ret void }
+
+ define amdgpu_kernel void @mov_fed_hazard_crash_on_dbg_value(i32 addrspace(1)* %A) {
+ entry:
+ %A.addr = alloca i32 addrspace(1)*, align 4
+ store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
+ call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !5, metadata !11), !dbg !12
+ ret void
+ }
+
+ declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+ !llvm.dbg.cu = !{!0}
+ !llvm.module.flags = !{!3, !4}
+
+ !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 268929)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+ !1 = !DIFile(filename: "test01.cl", directory: "/dev/null")
+ !2 = !{}
+ !3 = !{i32 2, !"Dwarf Version", i32 2}
+ !4 = !{i32 2, !"Debug Info Version", i32 3}
+ !5 = !DILocalVariable(name: "A", arg: 1, scope: !6, file: !1, line: 1, type: !9)
+ !6 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+ !7 = !DISubroutineType(types: !8)
+ !8 = !{null, !9}
+ !9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64, align: 32)
+ !10 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+ !11 = !DIExpression()
+ !12 = !DILocation(line: 1, column: 30, scope: !6)
+
...
---
# GCN-LABEL: name: div_fmas
@@ -331,3 +363,185 @@ body: |
S_ENDPGM
...
+
+...
+---
+
+# GCN-LABEL: name: s_mov_fed_b32
+
+# GCN-LABEL: bb.0:
+# GCN: S_MOV_FED_B32
+# GFX9: S_NOP
+# GCN-NEXT: S_MOV_B32
+
+# GCN-LABEL: bb.1:
+# GCN: S_MOV_FED_B32
+# GFX9: S_NOP
+# GCN-NEXT: V_MOV_B32
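+#
+# The new gfx900 RUN line exercises these hazards: on GFX9 the post-RA
+# hazard recognizer must insert an S_NOP between S_MOV_FED_B32 and a
+# following read of its result, while the older targets need no wait state.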
+name: s_mov_fed_b32
+
+body: |
+ bb.0:
+ successors: %bb.1
+ %sgpr0 = S_MOV_FED_B32 %sgpr0
+ %sgpr0 = S_MOV_B32 %sgpr0
+ S_BRANCH %bb.1
+
+ bb.1:
+ %sgpr0 = S_MOV_FED_B32 %sgpr0
+ %vgpr0 = V_MOV_B32_e32 %sgpr0, implicit %exec
+ S_ENDPGM
+
+...
+
+...
+---
+
+# GCN-LABEL: name: s_movrel
+
+# GCN-LABEL: bb.0:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: S_MOVRELS_B32
+
+# GCN-LABEL: bb.1:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: S_MOVRELS_B64
+
+# GCN-LABEL: bb.2:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: S_MOVRELD_B32
+
+# GCN-LABEL: bb.3:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: S_MOVRELD_B64
+
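+# On GFX9 an S_NOP is likewise required between a write to M0 and the
+# S_MOVREL* that reads it.
+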
+name: s_movrel
+
+body: |
+ bb.0:
+ successors: %bb.1
+ %m0 = S_MOV_B32 0
+ %sgpr0 = S_MOVRELS_B32 %sgpr0, implicit %m0
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2
+ %m0 = S_MOV_B32 0
+ %sgpr0_sgpr1 = S_MOVRELS_B64 %sgpr0_sgpr1, implicit %m0
+ S_BRANCH %bb.2
+
+ bb.2:
+ successors: %bb.3
+ %m0 = S_MOV_B32 0
+ %sgpr0 = S_MOVRELD_B32 %sgpr0, implicit %m0
+ S_BRANCH %bb.3
+
+ bb.3:
+ %m0 = S_MOV_B32 0
+ %sgpr0_sgpr1 = S_MOVRELD_B64 %sgpr0_sgpr1, implicit %m0
+ S_ENDPGM
+...
+
+...
+---
+
+# GCN-LABEL: name: v_interp
+
+# GCN-LABEL: bb.0:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: V_INTERP_P1_F32
+
+# GCN-LABEL: bb.1:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: V_INTERP_P2_F32
+
+# GCN-LABEL: bb.2:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: V_INTERP_P1_F32_16bank
+
+# GCN-LABEL: bb.3:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: V_INTERP_MOV_F32
+
+name: v_interp
+
+body: |
+ bb.0:
+ successors: %bb.1
+ %m0 = S_MOV_B32 0
+ %vgpr0 = V_INTERP_P1_F32 %vgpr0, 0, 0, implicit %m0, implicit %exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2
+ %m0 = S_MOV_B32 0
+ %vgpr0 = V_INTERP_P2_F32 %vgpr0, %vgpr1, 0, 0, implicit %m0, implicit %exec
+ S_BRANCH %bb.2
+
+ bb.2:
+ successors: %bb.3
+ %m0 = S_MOV_B32 0
+ %vgpr0 = V_INTERP_P1_F32_16bank %vgpr0, 0, 0, implicit %m0, implicit %exec
+ S_BRANCH %bb.3
+
+ bb.3:
+ %m0 = S_MOV_B32 0
+ %vgpr0 = V_INTERP_MOV_F32 0, 0, 0, implicit %m0, implicit %exec
+ S_ENDPGM
+...
+---
+name: mov_fed_hazard_crash_on_dbg_value
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%sgpr4_sgpr5' }
+ - { reg: '%sgpr6_sgpr7' }
+ - { reg: '%sgpr9' }
+ - { reg: '%sgpr0_sgpr1_sgpr2_sgpr3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 16
+ offsetAdjustment: 0
+ maxAlignment: 8
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+stack:
+ - { id: 0, name: A.addr, offset: 0, size: 8, alignment: 8, local-offset: 0 }
+ - { id: 1, offset: 8, size: 4, alignment: 4 }
+body: |
+ bb.0.entry:
+ liveins: %sgpr4_sgpr5, %sgpr6_sgpr7, %sgpr9, %sgpr0_sgpr1_sgpr2_sgpr3
+
+ %flat_scr_lo = S_ADD_U32 %sgpr6, %sgpr9, implicit-def %scc
+ %flat_scr_hi = S_ADDC_U32 %sgpr7, 0, implicit-def %scc, implicit %scc
+ DBG_VALUE _, 2, !5, !11, debug-location !12
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr4_sgpr5, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ dead %sgpr6_sgpr7 = KILL %sgpr4_sgpr5
+ %sgpr8 = S_MOV_B32 %sgpr5
+ %vgpr0 = V_MOV_B32_e32 killed %sgpr8, implicit %exec
+ BUFFER_STORE_DWORD_OFFSET %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr9, 4, 0, 0, 0, implicit %exec :: (store 4 into %ir.A.addr + 4)
+ %sgpr8 = S_MOV_B32 %sgpr4, implicit killed %sgpr4_sgpr5
+ %vgpr0 = V_MOV_B32_e32 killed %sgpr8, implicit %exec
+ BUFFER_STORE_DWORD_OFFSET %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr9, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.A.addr)
+ S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/internalize.ll b/test/CodeGen/AMDGPU/internalize.ll
new file mode 100644
index 000000000000..968b1d326a76
--- /dev/null
+++ b/test/CodeGen/AMDGPU/internalize.ll
@@ -0,0 +1,35 @@
+; RUN: opt -O1 -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-internalize-symbols < %s | FileCheck %s
+; CHECK-NOT: unused
+; CHECK-NOT: foo_used
+; CHECK: gvar_used
+; CHECK: main_kernel
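+;
+; -amdgpu-internalize-symbols gives the module's symbols local linkage so
+; the -O1 pipeline can drop what is unreachable: the unused pair goes away
+; directly, and foo_used is inlined (alwaysinline) into main_kernel and
+; then discarded, leaving only gvar_used and main_kernel.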
+
+@gvar_unused = addrspace(1) global i32 undef, align 4
+@gvar_used = addrspace(1) global i32 undef, align 4
+
+; Function Attrs: alwaysinline nounwind
+define amdgpu_kernel void @foo_unused(i32 addrspace(1)* %out) local_unnamed_addr #1 {
+entry:
+ store i32 1, i32 addrspace(1)* %out
+ ret void
+}
+
+; Function Attrs: alwaysinline nounwind
+define amdgpu_kernel void @foo_used(i32 addrspace(1)* %out, i32 %tid) local_unnamed_addr #1 {
+entry:
+ store i32 %tid, i32 addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @main_kernel() {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ tail call void @foo_used(i32 addrspace(1)* @gvar_used, i32 %tid) nounwind
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+
+attributes #1 = { alwaysinline nounwind }
diff --git a/test/CodeGen/AMDGPU/invalid-addrspacecast.ll b/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
index c29434f5eca2..31f2fbc919aa 100644
--- a/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
+++ b/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
@@ -1,7 +1,7 @@
; RUN: not llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
; ERROR: error: <unknown>:0:0: in function use_group_to_global_addrspacecast void (i32 addrspace(3)*): invalid addrspacecast
-define void @use_group_to_global_addrspacecast(i32 addrspace(3)* %ptr) {
+define amdgpu_kernel void @use_group_to_global_addrspacecast(i32 addrspace(3)* %ptr) {
%stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(1)*
store volatile i32 0, i32 addrspace(1)* %stof
ret void
diff --git a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll
deleted file mode 100644
index 49c314fbc5d0..000000000000
--- a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-runtime-metadata | FileCheck %s
-; check llc does not crash for invalid opencl version metadata
-
-; CHECK: { amd.MDVersion: [ 2, 0 ] }
-
-!opencl.ocl.version = !{}
diff --git a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll
deleted file mode 100644
index 1f5e8be531dc..000000000000
--- a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-runtime-metadata | FileCheck %s
-; check llc does not crash for invalid opencl version metadata
-
-; CHECK: { amd.MDVersion: [ 2, 0 ] }
-
-!opencl.ocl.version = !{!0}
-!0 = !{}
diff --git a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll
deleted file mode 100644
index b77551e268a0..000000000000
--- a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-runtime-metadata | FileCheck %s
-; check llc does not crash for invalid opencl version metadata
-
-; CHECK: { amd.MDVersion: [ 2, 0 ] }
-
-!opencl.ocl.version = !{!0}
-!0 = !{i32 1}
diff --git a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
index 45a061067cfc..5cd965d2fa9c 100644
--- a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
+++ b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
@@ -10,7 +10,7 @@
; GCN-DAG: buffer_load_dwordx2 [[PTR:v\[[0-9]+:[0-9]+\]]],
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b
; GCN: buffer_store_dword [[K]], [[PTR]]
-define void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 addrspace(1)* addrspace(1)* dereferenceable(4096) nonnull %in) #0 {
+define amdgpu_kernel void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 addrspace(1)* addrspace(1)* dereferenceable(4096) nonnull %in) #0 {
%ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(1)* %in, !invariant.load !0
%ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1
store i16 123, i16 addrspace(1)* %ptr, align 4
@@ -22,7 +22,7 @@ define void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 add
; GCN: s_load_dwordx2 s{{\[}}[[SPTR_LO:[0-9]+]]:[[SPTR_HI:[0-9]+]]{{\]}}
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b
; GCN: buffer_store_dword [[K]], off, s{{\[}}[[SPTR_LO]]:
-define void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(2)* dereferenceable(4096) nonnull %in) #0 {
+define amdgpu_kernel void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(2)* dereferenceable(4096) nonnull %in) #0 {
%ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(2)* %in, !invariant.load !0
%ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1
store i16 123, i16 addrspace(1)* %ptr, align 4
diff --git a/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir b/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir
index 66182d092895..bc1dafe0ea1e 100644
--- a/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir
+++ b/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir
@@ -1,7 +1,7 @@
# RUN: llc -run-pass block-placement -march=amdgcn -verify-machineinstrs -o - %s | FileCheck %s
--- |
- define void @invert_br_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 {
+ define amdgpu_kernel void @invert_br_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 {
entry:
br i1 undef, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
diff --git a/test/CodeGen/AMDGPU/kcache-fold.ll b/test/CodeGen/AMDGPU/kcache-fold.ll
index 43448fbd7b33..37dd977ae216 100644
--- a/test/CodeGen/AMDGPU/kcache-fold.ll
+++ b/test/CodeGen/AMDGPU/kcache-fold.ll
@@ -1,100 +1,112 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s
; CHECK: {{^}}main1:
; CHECK: MOV * T{{[0-9]+\.[XYZW], KC0}}
-define void @main1() {
+define amdgpu_kernel void @main1() #0 {
main_body:
- %0 = load <4 x float>, <4 x float> addrspace(8)* null
- %1 = extractelement <4 x float> %0, i32 0
- %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
- %3 = extractelement <4 x float> %2, i32 0
- %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
- %5 = extractelement <4 x float> %4, i32 0
- %6 = fcmp ogt float %1, 0.000000e+00
- %7 = select i1 %6, float %3, float %5
- %8 = load <4 x float>, <4 x float> addrspace(8)* null
- %9 = extractelement <4 x float> %8, i32 1
- %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
- %11 = extractelement <4 x float> %10, i32 1
- %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
- %13 = extractelement <4 x float> %12, i32 1
- %14 = fcmp ogt float %9, 0.000000e+00
- %15 = select i1 %14, float %11, float %13
- %16 = load <4 x float>, <4 x float> addrspace(8)* null
- %17 = extractelement <4 x float> %16, i32 2
- %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
- %19 = extractelement <4 x float> %18, i32 2
- %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
- %21 = extractelement <4 x float> %20, i32 2
- %22 = fcmp ogt float %17, 0.000000e+00
- %23 = select i1 %22, float %19, float %21
- %24 = load <4 x float>, <4 x float> addrspace(8)* null
- %25 = extractelement <4 x float> %24, i32 3
- %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
- %27 = extractelement <4 x float> %26, i32 3
- %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
- %29 = extractelement <4 x float> %28, i32 3
- %30 = fcmp ogt float %25, 0.000000e+00
- %31 = select i1 %30, float %27, float %29
- %32 = call float @llvm.AMDGPU.clamp.f32(float %7, float 0.000000e+00, float 1.000000e+00)
- %33 = call float @llvm.AMDGPU.clamp.f32(float %15, float 0.000000e+00, float 1.000000e+00)
- %34 = call float @llvm.AMDGPU.clamp.f32(float %23, float 0.000000e+00, float 1.000000e+00)
- %35 = call float @llvm.AMDGPU.clamp.f32(float %31, float 0.000000e+00, float 1.000000e+00)
- %36 = insertelement <4 x float> undef, float %32, i32 0
- %37 = insertelement <4 x float> %36, float %33, i32 1
- %38 = insertelement <4 x float> %37, float %34, i32 2
- %39 = insertelement <4 x float> %38, float %35, i32 3
- call void @llvm.r600.store.swizzle(<4 x float> %39, i32 0, i32 0)
+ %tmp = load <4 x float>, <4 x float> addrspace(8)* null
+ %tmp7 = extractelement <4 x float> %tmp, i32 0
+ %tmp8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %tmp9 = extractelement <4 x float> %tmp8, i32 0
+ %tmp10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %tmp11 = extractelement <4 x float> %tmp10, i32 0
+ %tmp12 = fcmp ogt float %tmp7, 0.000000e+00
+ %tmp13 = select i1 %tmp12, float %tmp9, float %tmp11
+ %tmp14 = load <4 x float>, <4 x float> addrspace(8)* null
+ %tmp15 = extractelement <4 x float> %tmp14, i32 1
+ %tmp16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %tmp17 = extractelement <4 x float> %tmp16, i32 1
+ %tmp18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %tmp19 = extractelement <4 x float> %tmp18, i32 1
+ %tmp20 = fcmp ogt float %tmp15, 0.000000e+00
+ %tmp21 = select i1 %tmp20, float %tmp17, float %tmp19
+ %tmp22 = load <4 x float>, <4 x float> addrspace(8)* null
+ %tmp23 = extractelement <4 x float> %tmp22, i32 2
+ %tmp24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %tmp25 = extractelement <4 x float> %tmp24, i32 2
+ %tmp26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %tmp27 = extractelement <4 x float> %tmp26, i32 2
+ %tmp28 = fcmp ogt float %tmp23, 0.000000e+00
+ %tmp29 = select i1 %tmp28, float %tmp25, float %tmp27
+ %tmp30 = load <4 x float>, <4 x float> addrspace(8)* null
+ %tmp31 = extractelement <4 x float> %tmp30, i32 3
+ %tmp32 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %tmp33 = extractelement <4 x float> %tmp32, i32 3
+ %tmp34 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %tmp35 = extractelement <4 x float> %tmp34, i32 3
+ %tmp36 = fcmp ogt float %tmp31, 0.000000e+00
+ %tmp37 = select i1 %tmp36, float %tmp33, float %tmp35
+ %max.0.i = call float @llvm.maxnum.f32(float %tmp13, float 0.000000e+00)
+ %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00)
+ %max.0.i5 = call float @llvm.maxnum.f32(float %tmp21, float 0.000000e+00)
+ %clamp.i6 = call float @llvm.minnum.f32(float %max.0.i5, float 1.000000e+00)
+ %max.0.i3 = call float @llvm.maxnum.f32(float %tmp29, float 0.000000e+00)
+ %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00)
+ %max.0.i1 = call float @llvm.maxnum.f32(float %tmp37, float 0.000000e+00)
+ %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00)
+ %tmp38 = insertelement <4 x float> undef, float %clamp.i, i32 0
+ %tmp39 = insertelement <4 x float> %tmp38, float %clamp.i6, i32 1
+ %tmp40 = insertelement <4 x float> %tmp39, float %clamp.i4, i32 2
+ %tmp41 = insertelement <4 x float> %tmp40, float %clamp.i2, i32 3
+ call void @llvm.r600.store.swizzle(<4 x float> %tmp41, i32 0, i32 0)
ret void
}
; CHECK: {{^}}main2:
; CHECK-NOT: MOV
-define void @main2() {
+define amdgpu_kernel void @main2() #0 {
main_body:
- %0 = load <4 x float>, <4 x float> addrspace(8)* null
- %1 = extractelement <4 x float> %0, i32 0
- %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
- %3 = extractelement <4 x float> %2, i32 0
- %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
- %5 = extractelement <4 x float> %4, i32 1
- %6 = fcmp ogt float %1, 0.000000e+00
- %7 = select i1 %6, float %3, float %5
- %8 = load <4 x float>, <4 x float> addrspace(8)* null
- %9 = extractelement <4 x float> %8, i32 1
- %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
- %11 = extractelement <4 x float> %10, i32 0
- %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
- %13 = extractelement <4 x float> %12, i32 1
- %14 = fcmp ogt float %9, 0.000000e+00
- %15 = select i1 %14, float %11, float %13
- %16 = load <4 x float>, <4 x float> addrspace(8)* null
- %17 = extractelement <4 x float> %16, i32 2
- %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
- %19 = extractelement <4 x float> %18, i32 3
- %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
- %21 = extractelement <4 x float> %20, i32 2
- %22 = fcmp ogt float %17, 0.000000e+00
- %23 = select i1 %22, float %19, float %21
- %24 = load <4 x float>, <4 x float> addrspace(8)* null
- %25 = extractelement <4 x float> %24, i32 3
- %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
- %27 = extractelement <4 x float> %26, i32 3
- %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
- %29 = extractelement <4 x float> %28, i32 2
- %30 = fcmp ogt float %25, 0.000000e+00
- %31 = select i1 %30, float %27, float %29
- %32 = call float @llvm.AMDGPU.clamp.f32(float %7, float 0.000000e+00, float 1.000000e+00)
- %33 = call float @llvm.AMDGPU.clamp.f32(float %15, float 0.000000e+00, float 1.000000e+00)
- %34 = call float @llvm.AMDGPU.clamp.f32(float %23, float 0.000000e+00, float 1.000000e+00)
- %35 = call float @llvm.AMDGPU.clamp.f32(float %31, float 0.000000e+00, float 1.000000e+00)
- %36 = insertelement <4 x float> undef, float %32, i32 0
- %37 = insertelement <4 x float> %36, float %33, i32 1
- %38 = insertelement <4 x float> %37, float %34, i32 2
- %39 = insertelement <4 x float> %38, float %35, i32 3
- call void @llvm.r600.store.swizzle(<4 x float> %39, i32 0, i32 0)
+ %tmp = load <4 x float>, <4 x float> addrspace(8)* null
+ %tmp7 = extractelement <4 x float> %tmp, i32 0
+ %tmp8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %tmp9 = extractelement <4 x float> %tmp8, i32 0
+ %tmp10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %tmp11 = extractelement <4 x float> %tmp10, i32 1
+ %tmp12 = fcmp ogt float %tmp7, 0.000000e+00
+ %tmp13 = select i1 %tmp12, float %tmp9, float %tmp11
+ %tmp14 = load <4 x float>, <4 x float> addrspace(8)* null
+ %tmp15 = extractelement <4 x float> %tmp14, i32 1
+ %tmp16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %tmp17 = extractelement <4 x float> %tmp16, i32 0
+ %tmp18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %tmp19 = extractelement <4 x float> %tmp18, i32 1
+ %tmp20 = fcmp ogt float %tmp15, 0.000000e+00
+ %tmp21 = select i1 %tmp20, float %tmp17, float %tmp19
+ %tmp22 = load <4 x float>, <4 x float> addrspace(8)* null
+ %tmp23 = extractelement <4 x float> %tmp22, i32 2
+ %tmp24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %tmp25 = extractelement <4 x float> %tmp24, i32 3
+ %tmp26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %tmp27 = extractelement <4 x float> %tmp26, i32 2
+ %tmp28 = fcmp ogt float %tmp23, 0.000000e+00
+ %tmp29 = select i1 %tmp28, float %tmp25, float %tmp27
+ %tmp30 = load <4 x float>, <4 x float> addrspace(8)* null
+ %tmp31 = extractelement <4 x float> %tmp30, i32 3
+ %tmp32 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %tmp33 = extractelement <4 x float> %tmp32, i32 3
+ %tmp34 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %tmp35 = extractelement <4 x float> %tmp34, i32 2
+ %tmp36 = fcmp ogt float %tmp31, 0.000000e+00
+ %tmp37 = select i1 %tmp36, float %tmp33, float %tmp35
+ %max.0.i = call float @llvm.maxnum.f32(float %tmp13, float 0.000000e+00)
+ %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00)
+ %max.0.i5 = call float @llvm.maxnum.f32(float %tmp21, float 0.000000e+00)
+ %clamp.i6 = call float @llvm.minnum.f32(float %max.0.i5, float 1.000000e+00)
+ %max.0.i3 = call float @llvm.maxnum.f32(float %tmp29, float 0.000000e+00)
+ %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00)
+ %max.0.i1 = call float @llvm.maxnum.f32(float %tmp37, float 0.000000e+00)
+ %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00)
+ %tmp38 = insertelement <4 x float> undef, float %clamp.i, i32 0
+ %tmp39 = insertelement <4 x float> %tmp38, float %clamp.i6, i32 1
+ %tmp40 = insertelement <4 x float> %tmp39, float %clamp.i4, i32 2
+ %tmp41 = insertelement <4 x float> %tmp40, float %clamp.i2, i32 3
+ call void @llvm.r600.store.swizzle(<4 x float> %tmp41, i32 0, i32 0)
ret void
}
-declare float @llvm.AMDGPU.clamp.f32(float, float, float) readnone
-declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) #0
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
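The replacement pattern above is the generic expansion of a [0,1] clamp: maxnum against 0.0 followed by minnum against 1.0. A minimal self-contained sketch of the equivalence (the @clamp01 wrapper is hypothetical, not part of the test):

; clamp(x, 0.0, 1.0) == minnum(maxnum(x, 0.0), 1.0)
declare float @llvm.maxnum.f32(float, float)
declare float @llvm.minnum.f32(float, float)

define float @clamp01(float %x) {
  %lo = call float @llvm.maxnum.f32(float %x, float 0.000000e+00) ; raise to at least 0.0
  %hi = call float @llvm.minnum.f32(float %lo, float 1.000000e+00) ; cap at 1.0
  ret float %hi
}
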
diff --git a/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll b/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
index 21c92dbc9098..8e358ef2804f 100644
--- a/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
+++ b/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
@@ -4,40 +4,40 @@
; alignment of the stack
; CHECK-LABEL: {{^}}no_args:
-; CHECK: ScratchSize: 8{{$}}
-define void @no_args() {
+; CHECK: ScratchSize: 5{{$}}
+define amdgpu_kernel void @no_args() {
%alloca = alloca i8
store volatile i8 0, i8* %alloca
ret void
}
; CHECK-LABEL: {{^}}force_align32:
-; CHECK: ScratchSize: 8{{$}}
-define void @force_align32(<8 x i32>) {
+; CHECK: ScratchSize: 5{{$}}
+define amdgpu_kernel void @force_align32(<8 x i32>) {
%alloca = alloca i8
store volatile i8 0, i8* %alloca
ret void
}
; CHECK-LABEL: {{^}}force_align64:
-; CHECK: ScratchSize: 8{{$}}
-define void @force_align64(<16 x i32>) {
+; CHECK: ScratchSize: 5{{$}}
+define amdgpu_kernel void @force_align64(<16 x i32>) {
%alloca = alloca i8
store volatile i8 0, i8* %alloca
ret void
}
; CHECK-LABEL: {{^}}force_align128:
-; CHECK: ScratchSize: 8{{$}}
-define void @force_align128(<32 x i32>) {
+; CHECK: ScratchSize: 5{{$}}
+define amdgpu_kernel void @force_align128(<32 x i32>) {
%alloca = alloca i8
store volatile i8 0, i8* %alloca
ret void
}
; CHECK-LABEL: {{^}}force_align256:
-; CHECK: ScratchSize: 8{{$}}
-define void @force_align256(<64 x i32>) {
+; CHECK: ScratchSize: 5{{$}}
+define amdgpu_kernel void @force_align256(<64 x i32>) {
%alloca = alloca i8
store volatile i8 0, i8* %alloca
ret void
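Every kernel in this file allocates a single private byte, so the check value is the same throughout; the updated value is consistent with a 1-byte alloca plus a 4-byte emergency stack slot (1 + 4 = 5), per the accounting stated in large-alloca-compute.ll below, and the widening vector arguments only change kernarg layout, not scratch usage. A minimal sketch of the shared body (kernel name hypothetical):

define amdgpu_kernel void @scratch_one_byte() {
  %alloca = alloca i8               ; 1 byte of private (scratch) memory
  store volatile i8 0, i8* %alloca  ; volatile keeps the slot from being optimized away
  ret void
}
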
diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll
index 95a68319f8af..6fa26cb38793 100644
--- a/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/test/CodeGen/AMDGPU/kernel-args.ll
@@ -17,7 +17,7 @@
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
-define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
+define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
entry:
%0 = zext i8 %in to i32
store i32 %0, i32 addrspace(1)* %out, align 4
@@ -36,7 +36,7 @@ entry:
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
-define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
+define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
entry:
%0 = zext i8 %in to i32
store i32 %0, i32 addrspace(1)* %out, align 4
@@ -55,7 +55,7 @@ entry:
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_sbyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
-define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
+define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
entry:
%0 = sext i8 %in to i32
store i32 %0, i32 addrspace(1)* %out, align 4
@@ -75,7 +75,7 @@ entry:
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
-define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
+define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
entry:
%0 = zext i16 %in to i32
store i32 %0, i32 addrspace(1)* %out, align 4
@@ -94,7 +94,7 @@ entry:
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
-define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
+define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
entry:
%0 = zext i16 %in to i32
store i32 %0, i32 addrspace(1)* %out, align 4
@@ -113,7 +113,7 @@ entry:
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_sshort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
-define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
+define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
entry:
%0 = sext i16 %in to i32
store i32 %0, i32 addrspace(1)* %out, align 4
@@ -126,7 +126,7 @@ entry:
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8
-define void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
+define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
entry:
store i32 %in, i32 addrspace(1)* %out, align 4
ret void
@@ -138,7 +138,7 @@ entry:
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
-define void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
+define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
entry:
store float %in, float addrspace(1)* %out, align 4
ret void
@@ -152,7 +152,7 @@ entry:
; MESA-GCN: buffer_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
-define void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
+define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
entry:
store <2 x i8> %in, <2 x i8> addrspace(1)* %out
ret void
@@ -166,7 +166,7 @@ entry:
; MESA-GCN: buffer_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
-define void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
+define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
entry:
store <2 x i16> %in, <2 x i16> addrspace(1)* %out
ret void
@@ -179,7 +179,7 @@ entry:
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
-define void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
+define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
entry:
store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
ret void
@@ -192,7 +192,7 @@ entry:
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
-define void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
+define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
entry:
store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
ret void
@@ -209,7 +209,7 @@ entry:
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
-define void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
+define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
ret void
@@ -226,7 +226,7 @@ entry:
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
-define void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
+define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
entry:
store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
ret void
@@ -239,7 +239,7 @@ entry:
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
-define void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
+define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
entry:
store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
ret void
@@ -253,7 +253,7 @@ entry:
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
-define void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
+define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
entry:
store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
ret void
@@ -273,7 +273,7 @@ entry:
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
-define void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
+define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
entry:
store <4 x i8> %in, <4 x i8> addrspace(1)* %out
ret void
@@ -293,7 +293,7 @@ entry:
; HSA-GCN: flat_load_ushort
; HSA-GCN: flat_load_ushort
; HSA-GCN: flat_load_ushort
-define void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
+define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
entry:
store <4 x i16> %in, <4 x i16> addrspace(1)* %out
ret void
@@ -308,7 +308,7 @@ entry:
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
-define void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
+define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
entry:
store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
ret void
@@ -323,7 +323,7 @@ entry:
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
-define void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
+define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
entry:
store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
ret void
@@ -354,7 +354,7 @@ entry:
; HSA-GCN: flat_load_ubyte
; HSA-GCN: flat_load_ubyte
; HSA-GCN: flat_load_ubyte
-define void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
+define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
entry:
store <8 x i8> %in, <8 x i8> addrspace(1)* %out
ret void
@@ -386,7 +386,7 @@ entry:
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
-define void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
+define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
entry:
store <8 x i16> %in, <8 x i16> addrspace(1)* %out
ret void
@@ -405,7 +405,7 @@ entry:
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
-define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
+define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
entry:
store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
ret void
@@ -422,7 +422,7 @@ entry:
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
-define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
+define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
entry:
store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
ret void
@@ -478,7 +478,7 @@ entry:
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
-define void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
+define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
entry:
store <16 x i8> %in, <16 x i8> addrspace(1)* %out
ret void
@@ -534,7 +534,7 @@ entry:
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
-define void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
+define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
entry:
store <16 x i16> %in, <16 x i16> addrspace(1)* %out
ret void
@@ -561,7 +561,7 @@ entry:
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
-define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
+define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
entry:
store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
ret void
@@ -588,7 +588,7 @@ entry:
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
-define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
+define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
entry:
store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
ret void
@@ -599,7 +599,7 @@ entry:
; MESA-GCN: s_load_dwordx2
; MESA-GCN: buffer_store_dwordx2
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
-define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
+define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
store i64 %a, i64 addrspace(1)* %out, align 8
ret void
}
@@ -611,7 +611,7 @@ define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c
; MESA-GCN: buffer_store_dwordx2
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
-define void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
entry:
store double %in, double addrspace(1)* %out
ret void
@@ -621,7 +621,7 @@ entry:
; XGCN: s_load_dwordx2
; XGCN: s_load_dwordx2
; XGCN: buffer_store_dwordx2
-; define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
+; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
; store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
; ret void
; }
@@ -631,7 +631,7 @@ entry:
; SI: v_and_b32_e32
; SI: buffer_store_byte
; SI: s_endpgm
-define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
store i1 %x, i1 addrspace(1)* %out, align 1
ret void
}
@@ -640,7 +640,7 @@ define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
; SI: buffer_load_ubyte
; SI: buffer_store_dword
; SI: s_endpgm
-define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
%ext = zext i1 %x to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
@@ -650,7 +650,7 @@ define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
; SI: buffer_load_ubyte
; SI: buffer_store_dwordx2
; SI: s_endpgm
-define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
%ext = zext i1 %x to i64
store i64 %ext, i64 addrspace(1)* %out, align 8
ret void
@@ -660,7 +660,7 @@ define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
; SI: buffer_load_ubyte
; SI: buffer_store_dword
; SI: s_endpgm
-define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
%ext = sext i1 %x to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
@@ -672,7 +672,7 @@ define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
; SI: v_ashrrev_i32
; SI: buffer_store_dwordx2
; SI: s_endpgm
-define void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
%ext = sext i1 %x to i64
store i64 %ext, i64 addrspace(1)* %out, align 8
ret void
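Throughout this file the check offsets encode the kernarg ABI of each target: SI SMRD offsets are in dwords while VI uses byte offsets (0xb dwords = 0x2c bytes for the same i32 argument after a 64-bit out pointer), and the HSA runs read kernargs from s[4:5] rather than s[0:1]. A minimal sketch of the per-ABI pattern, reusing the offsets asserted above (kernel name hypothetical):

; SI:      s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI:  s_load_dword s{{[0-9]}}, s[4:5], 0x8
define amdgpu_kernel void @i32_arg_sketch(i32 addrspace(1)* %out, i32 %in) {
entry:
  store i32 %in, i32 addrspace(1)* %out, align 4
  ret void
}
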
diff --git a/test/CodeGen/AMDGPU/large-alloca-compute.ll b/test/CodeGen/AMDGPU/large-alloca-compute.ll
index 4f6dbf9dc2bf..4af37d8da966 100644
--- a/test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ b/test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -1,5 +1,6 @@
; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
; RUN: llc -march=amdgcn -mcpu=carrizo --show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 --show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=ALL %s
; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
; RUN: llc -march=amdgcn -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
@@ -14,6 +15,7 @@
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, -1
; CI-DAG: s_mov_b32 s{{[0-9]+}}, 0xe8f000
; VI-DAG: s_mov_b32 s{{[0-9]+}}, 0xe80000
+; GFX9-DAG: s_mov_b32 s{{[0-9]+}}, 0xe00000
; GCNHSA: .amd_kernel_code_t
@@ -46,7 +48,7 @@
; Scratch size = alloca size + emergency stack slot
; ALL: ; ScratchSize: 32772
-define void @large_alloca_compute_shader(i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @large_alloca_compute_shader(i32 %x, i32 %y) #0 {
%large = alloca [8192 x i32], align 4
%gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
store volatile i32 %x, i32* %gep
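The asserted value follows the formula in the comment above it; a worked check:

; [8192 x i32] alloca = 8192 * 4 = 32768 bytes
; + 4-byte emergency stack slot
; = 32772  ->  "; ScratchSize: 32772"
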
diff --git a/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/test/CodeGen/AMDGPU/large-alloca-graphics.ll
index ea9754a390b6..28b819a6374b 100644
--- a/test/CodeGen/AMDGPU/large-alloca-graphics.ll
+++ b/test/CodeGen/AMDGPU/large-alloca-graphics.ll
@@ -1,5 +1,6 @@
; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
; RUN: llc -march=amdgcn -mcpu=carrizo -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=ALL %s
; ALL-LABEL: {{^}}large_alloca_pixel_shader:
; GCN-DAG: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
@@ -7,6 +8,7 @@
; GCN-DAG: s_mov_b32 s10, -1
; CI-DAG: s_mov_b32 s11, 0xe8f000
; VI-DAG: s_mov_b32 s11, 0xe80000
+; GFX9-DAG: s_mov_b32 s11, 0xe00000
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
@@ -28,6 +30,7 @@ define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 {
; GCN-DAG: s_mov_b32 s10, -1
; CI-DAG: s_mov_b32 s11, 0xe8f000
; VI-DAG: s_mov_b32 s11, 0xe80000
+; GFX9-DAG: s_mov_b32 s11, 0xe00000
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
diff --git a/test/CodeGen/AMDGPU/large-constant-initializer.ll b/test/CodeGen/AMDGPU/large-constant-initializer.ll
index 9975b1b7f5cc..c46d68e38ade 100644
--- a/test/CodeGen/AMDGPU/large-constant-initializer.ll
+++ b/test/CodeGen/AMDGPU/large-constant-initializer.ll
@@ -4,7 +4,7 @@
@gv = external unnamed_addr addrspace(2) constant [239 x i32], align 4
-define void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind {
+define amdgpu_kernel void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind {
%val = load i32, i32 addrspace(2)* getelementptr ([239 x i32], [239 x i32] addrspace(2)* @gv, i64 0, i64 239), align 4
%mul12 = mul nsw i32 %val, 7
br i1 undef, label %exit, label %bb
diff --git a/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll b/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
index 906a688febd2..13dd7058c50a 100644
--- a/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
+++ b/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
@@ -1,8 +1,10 @@
-; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck --check-prefix=SI --check-prefix=ALL %s
+; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck --check-prefix=CI --check-prefix=ALL %s
-; CHECK: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
+; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
+; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
-define void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
%stack = alloca [5 x i32], align 4
%0 = load i32, i32 addrspace(1)* %in, align 4
@@ -22,9 +24,9 @@ entry:
ret void
}
-; CHECK: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
+; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
-define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
+define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
entry:
%stack = alloca [5 x i32], align 4
%0 = load i32, i32 addrspace(1)* %in, align 4
@@ -44,9 +46,9 @@ entry:
ret void
}
-; CHECK: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
+; ALL: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
-define void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
+define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
entry:
%stack = alloca [5 x i32], align 4
%0 = load i32, i32 addrspace(1)* %in, align 4
@@ -66,9 +68,10 @@ entry:
ret void
}
-; CHECK-LABEL: @occupancy_0(
-; CHECK: alloca [5 x i32]
-define void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
+; ALL-LABEL: @occupancy_0(
+; CI-NOT: alloca [5 x i32]
+; SI: alloca [5 x i32]
+define amdgpu_kernel void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
entry:
%stack = alloca [5 x i32], align 4
%0 = load i32, i32 addrspace(1)* %in, align 4
@@ -88,9 +91,10 @@ entry:
ret void
}
-; CHECK-LABEL: @occupancy_max(
-; CHECK: alloca [5 x i32]
-define void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
+; ALL-LABEL: @occupancy_max(
+; CI-NOT: alloca [5 x i32]
+; SI: alloca [5 x i32]
+define amdgpu_kernel void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
entry:
%stack = alloca [5 x i32], align 4
%0 = load i32, i32 addrspace(1)* %in, align 4
@@ -110,9 +114,11 @@ entry:
ret void
}
-; CHECK-LABEL: @occupancy_6(
-; CHECK-NOT: alloca
-define void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
+; SI-LABEL: @occupancy_6(
+; CI-LABEL: @occupancy_6(
+; SI: alloca
+; CI-NOT: alloca
+define amdgpu_kernel void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
entry:
%stack = alloca [42 x i8], align 4
%tmp = load i8, i8 addrspace(1)* %in, align 1
@@ -134,9 +140,9 @@ entry:
ret void
}
-; CHECK-LABEL: @occupancy_6_over(
-; CHECK: alloca [43 x i8]
-define void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
+; ALL-LABEL: @occupancy_6_over(
+; ALL: alloca [43 x i8]
+define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
entry:
%stack = alloca [43 x i8], align 4
%tmp = load i8, i8 addrspace(1)* %in, align 1
@@ -158,9 +164,11 @@ entry:
ret void
}
-; CHECK-LABEL: @occupancy_8(
-; CHECK-NOT: alloca
-define void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
+; SI-LABEL: @occupancy_8(
+; CI-LABEL: @occupancy_8(
+; SI: alloca
+; CI-NOT: alloca
+define amdgpu_kernel void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
entry:
%stack = alloca [32 x i8], align 4
%tmp = load i8, i8 addrspace(1)* %in, align 1
@@ -182,9 +190,9 @@ entry:
ret void
}
-; CHECK-LABEL: @occupancy_8_over(
-; CHECK: alloca [33 x i8]
-define void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
+; ALL-LABEL: @occupancy_8_over(
+; ALL: alloca [33 x i8]
+define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
entry:
%stack = alloca [33 x i8], align 4
%tmp = load i8, i8 addrspace(1)* %in, align 1
@@ -206,9 +214,11 @@ entry:
ret void
}
-; CHECK-LABEL: @occupancy_9(
-; CHECK-NOT: alloca
-define void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
+; SI-LABEL: @occupancy_9(
+; CI-LABEL: @occupancy_9(
+; SI: alloca
+; CI-NOT: alloca
+define amdgpu_kernel void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
entry:
%stack = alloca [28 x i8], align 4
%tmp = load i8, i8 addrspace(1)* %in, align 1
@@ -230,9 +240,9 @@ entry:
ret void
}
-; CHECK-LABEL: @occupancy_9_over(
-; CHECK: alloca [29 x i8]
-define void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
+; ALL-LABEL: @occupancy_9_over(
+; ALL: alloca [29 x i8]
+define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
entry:
%stack = alloca [29 x i8], align 4
%tmp = load i8, i8 addrspace(1)* %in, align 1
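The pass these tests exercise rewrites a per-thread alloca into one LDS array replicated across the maximum workgroup size, and gives up when the result would not fit the occupancy budget (the -NOT lines above). A worked example from the globals checked earlier in the file:

; alloca [5 x i32] with a 256-thread workgroup becomes
;   @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
; i.e. 256 * 5 * 4 = 5120 bytes of LDS per workgroup
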
diff --git a/test/CodeGen/AMDGPU/lds-alignment.ll b/test/CodeGen/AMDGPU/lds-alignment.ll
index 99334585e589..c23dea2b6b76 100644
--- a/test/CodeGen/AMDGPU/lds-alignment.ll
+++ b/test/CodeGen/AMDGPU/lds-alignment.ll
@@ -15,7 +15,7 @@ declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace
; HSA-LABEL: {{^}}test_no_round_size_1:
; HSA: workgroup_group_segment_byte_size = 38
-define void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false)
@@ -34,7 +34,7 @@ define void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #
; HSA-LABEL: {{^}}test_round_size_2:
; HSA: workgroup_group_segment_byte_size = 86
; HSA: group_segment_alignment = 4
-define void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false)
@@ -50,7 +50,7 @@ define void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
; HSA-LABEL: {{^}}test_round_size_2_align_8:
; HSA: workgroup_group_segment_byte_size = 86
; HSA: group_segment_alignment = 4
-define void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
@@ -65,7 +65,7 @@ define void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %
; HSA-LABEL: {{^}}test_round_local_lds_and_arg:
; HSA: workgroup_group_segment_byte_size = 38
; HSA: group_segment_alignment = 4
-define void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
+define amdgpu_kernel void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
%lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
@@ -78,7 +78,7 @@ define void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)
; HSA-LABEL: {{^}}test_round_lds_arg:
; HSA: workgroup_group_segment_byte_size = 0
; HSA: group_segment_alignment = 4
-define void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
+define amdgpu_kernel void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 4, i1 false)
ret void
@@ -88,7 +88,7 @@ define void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8
; HSA-LABEL: {{^}}test_high_align_lds_arg:
; HSA: workgroup_group_segment_byte_size = 0
; HSA: group_segment_alignment = 4
-define void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 {
+define amdgpu_kernel void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 {
call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 64, i1 false)
call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 64, i1 false)
ret void
@@ -98,7 +98,7 @@ define void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in
; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
; HSA: workgroup_group_segment_byte_size = 212
; HSA: group_segment_alignment = 4
-define void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.0.bc, i8 addrspace(1)* %in, i32 160, i32 4, i1 false)
call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.0.bc, i32 160, i32 4, i1 false)
@@ -114,7 +114,7 @@ define void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addr
; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1:
; HSA: workgroup_group_segment_byte_size = 216
; HSA: group_segment_alignment = 4
-define void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)*
call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.1.bc, i8 addrspace(1)* %in, i32 56, i32 8, i1 false)
call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.1.bc, i32 56, i32 8, i1 false)
@@ -142,7 +142,7 @@ define void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addr
; HSA-LABEL: {{^}}test_round_size_3_order0:
; HSA: workgroup_group_segment_byte_size = 134
; HSA: group_segment_alignment = 4
-define void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
@@ -163,7 +163,7 @@ define void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %i
; HSA-LABEL: {{^}}test_round_size_3_order1:
; HSA: workgroup_group_segment_byte_size = 134
; HSA: group_segment_alignment = 4
-define void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
@@ -184,7 +184,7 @@ define void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %i
; HSA-LABEL: {{^}}test_round_size_3_order2:
; HSA: workgroup_group_segment_byte_size = 150
; HSA: group_segment_alignment = 4
-define void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
@@ -205,7 +205,7 @@ define void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %i
; HSA-LABEL: {{^}}test_round_size_3_order3:
; HSA: workgroup_group_segment_byte_size = 118
; HSA: group_segment_alignment = 4
-define void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
@@ -226,7 +226,7 @@ define void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %i
; HSA-LABEL: {{^}}test_round_size_3_order4:
; HSA: workgroup_group_segment_byte_size = 142
; HSA: group_segment_alignment = 4
-define void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
@@ -247,7 +247,7 @@ define void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %i
; HSA-LABEL: {{^}}test_round_size_3_order5:
; HSA: workgroup_group_segment_byte_size = 126
; HSA: group_segment_alignment = 4
-define void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
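The asserted segment sizes are consistent with laying out the LDS globals in order, rounding each start up to the object's alignment; one worked case from above:

; test_round_size_2: two 38-byte objects, the second starting after a
; 16-byte-aligned first object:
;   round_up(38, 16) + 38 = 48 + 38 = 86
; matching "workgroup_group_segment_byte_size = 86"
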
diff --git a/test/CodeGen/AMDGPU/lds-initializer.ll b/test/CodeGen/AMDGPU/lds-initializer.ll
index 9875814b03d3..254673d8a1e4 100644
--- a/test/CodeGen/AMDGPU/lds-initializer.ll
+++ b/test/CodeGen/AMDGPU/lds-initializer.ll
@@ -5,7 +5,7 @@
@lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8]
-define void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) {
+define amdgpu_kernel void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) {
%gep = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds, i32 0, i32 10
%ld = load i32, i32 addrspace(3)* %gep
store i32 %ld, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll b/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
index 078d6330ce04..1b3eeed3005c 100644
--- a/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
+++ b/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
@@ -18,7 +18,7 @@
; GCN: BB0_3:
; GCN-NEXT: s_endpgm
-define void @copy_local_to_global_loop_m0_init(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(3)* noalias nocapture readonly %in, i32 %n) #0 {
+define amdgpu_kernel void @copy_local_to_global_loop_m0_init(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(3)* noalias nocapture readonly %in, i32 %n) #0 {
bb:
%tmp = icmp sgt i32 %n, 0
br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
diff --git a/test/CodeGen/AMDGPU/lds-oqap-crash.ll b/test/CodeGen/AMDGPU/lds-oqap-crash.ll
index 6ff6fc3d7afc..fff2a9200729 100644
--- a/test/CodeGen/AMDGPU/lds-oqap-crash.ll
+++ b/test/CodeGen/AMDGPU/lds-oqap-crash.ll
@@ -10,7 +10,7 @@
; reads and writes are bundled together in the same instruction.
; CHECK: {{^}}lds_crash:
-define void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) {
entry:
%0 = load i32, i32 addrspace(3)* %in
; This block needs to be > 115 ISA instructions to hit the bug,
diff --git a/test/CodeGen/AMDGPU/lds-output-queue.ll b/test/CodeGen/AMDGPU/lds-output-queue.ll
index abe472e423fc..8b7e9e6d6aa8 100644
--- a/test/CodeGen/AMDGPU/lds-output-queue.ll
+++ b/test/CodeGen/AMDGPU/lds-output-queue.ll
@@ -10,7 +10,7 @@
@local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4
-define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
+define amdgpu_kernel void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
entry:
%0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
%1 = load i32, i32 addrspace(3)* %0
@@ -88,7 +88,7 @@ declare void @llvm.r600.group.barrier() nounwind convergent
; CHECK: LDS_READ_RET
; CHECK-NOT: ALU clause
; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
-define void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0
%1 = load i32, i32 addrspace(3)* %0
diff --git a/test/CodeGen/AMDGPU/lds-size.ll b/test/CodeGen/AMDGPU/lds-size.ll
index 1607713090e3..c65817abd489 100644
--- a/test/CodeGen/AMDGPU/lds-size.ll
+++ b/test/CodeGen/AMDGPU/lds-size.ll
@@ -14,7 +14,7 @@
; GCN: ; LDSByteSize: 4 bytes/workgroup (compile time only)
@lds = internal unnamed_addr addrspace(3) global i32 undef, align 4
-define void @test(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %cond) {
entry:
%0 = icmp eq i32 %cond, 0
br i1 %0, label %if, label %else
diff --git a/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/test/CodeGen/AMDGPU/lds-zero-initializer.ll
index cb5d73fb0d8b..53c1c727a19d 100644
--- a/test/CodeGen/AMDGPU/lds-zero-initializer.ll
+++ b/test/CodeGen/AMDGPU/lds-zero-initializer.ll
@@ -5,7 +5,7 @@
@lds = addrspace(3) global [256 x i32] zeroinitializer
-define void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) {
+define amdgpu_kernel void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) {
%gep = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds, i32 0, i32 10
%ld = load i32, i32 addrspace(3)* %gep
store i32 %ld, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll b/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll
index 4244c48d240e..e85a1b690af6 100644
--- a/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll
+++ b/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll
@@ -11,7 +11,7 @@
; CHECK: {{^}}setcc_expand:
; CHECK: SET
; CHECK-NOT: CND
-define void @setcc_expand(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @setcc_expand(i32 addrspace(1)* %out, i32 %in) {
entry:
%0 = icmp eq i32 %in, 5
br i1 %0, label %IF, label %ENDIF
diff --git a/test/CodeGen/AMDGPU/limit-coalesce.mir b/test/CodeGen/AMDGPU/limit-coalesce.mir
new file mode 100644
index 000000000000..106a96e32dc3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/limit-coalesce.mir
@@ -0,0 +1,71 @@
+# RUN: llc -march=amdgcn -run-pass simple-register-coalescing -o - %s | FileCheck %s
+
+# Check that the coalescer does not create a wider register tuple than in the source
+
+# CHECK: - { id: 2, class: vreg_64 }
+# CHECK: - { id: 3, class: vreg_64 }
+# CHECK: - { id: 4, class: vreg_64 }
+# CHECK: - { id: 5, class: vreg_96 }
+# CHECK: - { id: 6, class: vreg_96 }
+# CHECK: - { id: 7, class: vreg_128 }
+# CHECK: - { id: 8, class: vreg_128 }
+# No more registers shall be defined
+# CHECK-NEXT: liveins:
+# CHECK: FLAT_STORE_DWORDX2 %vgpr0_vgpr1, %4,
+# CHECK: FLAT_STORE_DWORDX3 %vgpr0_vgpr1, %6,
+
+---
+name: main
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 1, class: sreg_32_xm0, preferred-register: '%1' }
+ - { id: 2, class: vreg_64, preferred-register: '%2' }
+ - { id: 3, class: vreg_64 }
+ - { id: 4, class: vreg_64 }
+ - { id: 5, class: vreg_64 }
+ - { id: 6, class: vreg_96 }
+ - { id: 7, class: vreg_96 }
+ - { id: 8, class: vreg_128 }
+ - { id: 9, class: vreg_128 }
+liveins:
+ - { reg: '%sgpr6', virtual-reg: '%1' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ liveins: %sgpr0, %vgpr0_vgpr1
+
+ %3 = IMPLICIT_DEF
+ undef %4.sub0 = COPY %sgpr0
+ %4.sub1 = COPY %3.sub0
+ undef %5.sub0 = COPY %4.sub1
+ %5.sub1 = COPY %4.sub0
+ FLAT_STORE_DWORDX2 %vgpr0_vgpr1, killed %5, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+ %6 = IMPLICIT_DEF
+ undef %7.sub0_sub1 = COPY %6
+ %7.sub2 = COPY %3.sub0
+ FLAT_STORE_DWORDX3 %vgpr0_vgpr1, killed %7, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+ %8 = IMPLICIT_DEF
+ undef %9.sub0_sub1_sub2 = COPY %8
+ %9.sub3 = COPY %3.sub0
+ FLAT_STORE_DWORDX4 %vgpr0_vgpr1, killed %9, 0, 0, 0, implicit %exec, implicit %flat_scr
+...
diff --git a/test/CodeGen/AMDGPU/literals.ll b/test/CodeGen/AMDGPU/literals.ll
index 82fbb7f46186..1c546ba9f74b 100644
--- a/test/CodeGen/AMDGPU/literals.ll
+++ b/test/CodeGen/AMDGPU/literals.ll
@@ -10,7 +10,7 @@
; CHECK: LSHR
; CHECK-NEXT: ADD_INT * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y
; CHECK-NEXT: 5
-define void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
entry:
%0 = add i32 5, %in
store i32 %0, i32 addrspace(1)* %out
@@ -27,7 +27,7 @@ entry:
; CHECK: LSHR
; CHECK-NEXT: ADD * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y
; CHECK-NEXT: 1084227584(5.0
-define void @float_literal(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @float_literal(float addrspace(1)* %out, float %in) {
entry:
%0 = fadd float 5.0, %in
store float %0, float addrspace(1)* %out
@@ -41,7 +41,7 @@ entry:
; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Z, 0.0
; CHECK-NEXT: MOV {{\** *}}T[[GPR]].W, 0.0
-define void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) {
entry:
store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> addrspace(1)* %out
ret void
@@ -52,7 +52,7 @@ entry:
; CHECK-NEXT: DOT4 T[[GPR]].Y (MASKED), 1.0
; CHECK-NEXT: DOT4 T[[GPR]].Z (MASKED), 1.0
; CHECK-NEXT: DOT4 * T[[GPR]].W (MASKED), 1.0
-define void @inline_literal_dot4(float addrspace(1)* %out) {
+define amdgpu_kernel void @inline_literal_dot4(float addrspace(1)* %out) {
entry:
%0 = call float @llvm.r600.dot4(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store float %0, float addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/liveness.mir b/test/CodeGen/AMDGPU/liveness.mir
index 112c3f8e69a6..48762e3f2ab4 100644
--- a/test/CodeGen/AMDGPU/liveness.mir
+++ b/test/CodeGen/AMDGPU/liveness.mir
@@ -8,7 +8,7 @@
# Should see three distinct value numbers:
# CHECK: %vreg0 [{{.*}}:0)[{{.*}}:1)[{{.*}}:2) 0@{{[0-9]+[Berd]}} 1@{{[0-9]+[Berd]}} 2@{{[0-9]+B-phi}}
--- |
- define void @test0() { ret void }
+ define amdgpu_kernel void @test0() { ret void }
...
---
name: test0
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll
deleted file mode 100644
index 77dd4b134982..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll
+++ /dev/null
@@ -1,437 +0,0 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone
-
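For reference, the intrinsic this deleted file covered performs a signed bitfield extract; a minimal sketch of its shift-pair expansion (function name hypothetical, valid for 0 < width and offset + width <= 32):

define i32 @bfe_i32_expand(i32 %x, i32 %offset, i32 %width) {
  ; bfe_i32(x, o, w) == ashr(shl(x, 32 - o - w), 32 - w)
  %ow  = add i32 %offset, %width
  %sh1 = sub i32 32, %ow
  %sh2 = sub i32 32, %width
  %shl = shl i32 %x, %sh1
  %res = ashr i32 %shl, %sh2
  ret i32 %res
}

This matches the lowering the checks below expect, e.g. bfe_i32_test_6 folds bfe(shl(x, 31), 1, 31) into a shift left by 31 followed by an arithmetic shift right by 1.
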
-; FUNC-LABEL: {{^}}bfe_i32_arg_arg_arg:
-; SI: v_bfe_i32
-; EG: BFE_INT
-; EG: encoding: [{{[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+}},0xac
-define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_arg_arg_imm:
-; SI: v_bfe_i32
-; EG: BFE_INT
-define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 123) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_arg_imm_arg:
-; SI: v_bfe_i32
-; EG: BFE_INT
-define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 123, i32 %src2) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_imm_arg_arg:
-; SI: v_bfe_i32
-; EG: BFE_INT
-define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 123, i32 %src1, i32 %src2) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}v_bfe_print_arg:
-; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8
-define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) nounwind {
- %load = load i32, i32 addrspace(1)* %src0, align 4
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 2, i32 8) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_reg_offset:
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 0) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_imm_offset:
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 8, i32 0) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_test_6:
-; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; SI: s_endpgm
-define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %shl = shl i32 %x, 31
- %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 1, i32 31)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_test_7:
-; SI-NOT: shl
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %shl = shl i32 %x, 31
- %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 0, i32 31)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_test_8:
-; SI: buffer_load_dword
-; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
-; SI: s_endpgm
-define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %shl = shl i32 %x, 31
- %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_test_9:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 31, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_test_10:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 1, i32 31)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_test_11:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 8, i32 24)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_test_12:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 24, i32 8)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_test_13:
-; SI: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %shl = ashr i32 %x, 31
- %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_test_14:
-; SI-NOT: lshr
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %shl = lshr i32 %x, 31
- %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_0:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 0) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_1:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 12334, i32 0, i32 0) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_2:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 1) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_3:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 1, i32 0, i32 1) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_4:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 0, i32 1) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_5:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 7, i32 1) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_6:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0xffffff80
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 0, i32 8) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_7:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 0, i32 8) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_8:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 6, i32 8) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_9:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65536, i32 16, i32 8) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_10:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65535, i32 16, i32 16) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_11:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -6
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 4) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_12:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 31, i32 1) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_13:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 131070, i32 16, i32 16) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_14:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 2, i32 30) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_15:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 28) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_16:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 1, i32 7) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_17:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 1, i32 31) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_18:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
- %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 31, i32 1) nounwind readnone
- store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_sext_in_reg_i24:
-; SI: buffer_load_dword [[LOAD:v[0-9]+]],
-; SI-NOT: v_lshl
-; SI-NOT: v_ashr
-; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24
-; SI: buffer_store_dword [[BFE]],
-define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 0, i32 24)
- %shl = shl i32 %bfe, 8
- %ashr = ashr i32 %shl, 8
- store i32 %ashr, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: @simplify_demanded_bfe_sdiv
-; SI: buffer_load_dword [[LOAD:v[0-9]+]]
-; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 1, 16
-; SI: v_lshrrev_b32_e32 [[TMP0:v[0-9]+]], 31, [[BFE]]
-; SI: v_add_i32_e32 [[TMP1:v[0-9]+]], vcc, [[TMP0]], [[BFE]]
-; SI: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]]
-; SI: buffer_store_dword [[TMP2]]
-define void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %src = load i32, i32 addrspace(1)* %in, align 4
- %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %src, i32 1, i32 16) nounwind readnone
- %div = sdiv i32 %bfe, 2
- store i32 %div, i32 addrspace(1)* %out, align 4
- ret void
-}
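
A note on the constant-fold expectations in the file deleted above: llvm.AMDGPU.bfe.i32 is a sign-extending bitfield extract, so for a non-zero width it behaves like a left shift followed by an arithmetic right shift. That is my reading of the checks, not something the tests state; the hypothetical @bfe_i32_ref helper below is a minimal sketch, assuming 0 < w and o + w <= 32:

; bfe.i32(src, o, w) == (src << (32 - o - w)) ashr (32 - w); a width of 0 folds to 0
define i32 @bfe_i32_ref(i32 %src, i32 %o, i32 %w) {
  %pre = sub i32 32, %o
  %sh.l = sub i32 %pre, %w      ; 32 - o - w
  %sh.r = sub i32 32, %w        ; 32 - w
  %shl = shl i32 %src, %sh.l    ; move the field's top bit to bit 31
  %res = ashr i32 %shl, %sh.r   ; arithmetic shift sign-extends the field
  ret i32 %res
}

Worked against constant_fold_test_11: src = 160 = 0b1010_0000, o = 4, w = 4; the extracted field is 0b1010 with its sign bit set, which sign-extends to -6, matching the expected v_mov_b32_e32 [[VREG]], -6.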
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll
deleted file mode 100644
index ee47b14c496d..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll
+++ /dev/null
@@ -1,631 +0,0 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-declare i32 @llvm.AMDGPU.bfe.u32(i32, i32, i32) nounwind readnone
-
-; FUNC-LABEL: {{^}}bfe_u32_arg_arg_arg:
-; SI: v_bfe_u32
-; EG: BFE_UINT
-define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_arg_arg_imm:
-; SI: v_bfe_u32
-; EG: BFE_UINT
-define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 123) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_arg_imm_arg:
-; SI: v_bfe_u32
-; EG: BFE_UINT
-define void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 123, i32 %src2) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_imm_arg_arg:
-; SI: v_bfe_u32
-; EG: BFE_UINT
-define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 123, i32 %src1, i32 %src2) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_reg_offset:
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 0) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_imm_offset:
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 8, i32 0) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_zextload_i8:
-; SI: buffer_load_ubyte
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
- %load = load i8, i8 addrspace(1)* %in
- %ext = zext i8 %load to i32
- %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8:
-; GCN: buffer_load_dword
-; SI: v_add_i32
-; SI-NEXT: v_and_b32_e32
-; FIXME: Should be using s_add_i32
-; VI: v_add_i32
-; VI-NEXT: v_and_b32_e32
-; SI-NOT: {{[^@]}}bfe
-; GCN: s_endpgm
-define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %load = load i32, i32 addrspace(1)* %in, align 4
- %add = add i32 %load, 1
- %ext = and i32 %add, 255
- %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16:
-; SI: buffer_load_dword
-; SI: v_add_i32
-; SI-NEXT: v_and_b32_e32
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %load = load i32, i32 addrspace(1)* %in, align 4
- %add = add i32 %load, 1
- %ext = and i32 %add, 65535
- %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 16)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_1:
-; SI: buffer_load_dword
-; SI: v_add_i32
-; SI: bfe
-; SI: s_endpgm
-define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %load = load i32, i32 addrspace(1)* %in, align 4
- %add = add i32 %load, 1
- %ext = and i32 %add, 255
- %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 1, i32 8)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_3:
-; SI: buffer_load_dword
-; SI: v_add_i32
-; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8
-; SI-NEXT: bfe
-; SI: s_endpgm
-define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %load = load i32, i32 addrspace(1)* %in, align 4
- %add = add i32 %load, 1
- %ext = and i32 %add, 255
- %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 3, i32 8)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_7:
-; SI: buffer_load_dword
-; SI: v_add_i32
-; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80
-; SI-NEXT: bfe
-; SI: s_endpgm
-define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %load = load i32, i32 addrspace(1)* %in, align 4
- %add = add i32 %load, 1
- %ext = and i32 %add, 255
- %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 7, i32 8)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16_offset_8:
-; SI: buffer_load_dword
-; SI: v_add_i32
-; SI-NEXT: bfe
-; SI: s_endpgm
-define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %load = load i32, i32 addrspace(1)* %in, align 4
- %add = add i32 %load, 1
- %ext = and i32 %add, 65535
- %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 8, i32 8)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_1:
-; SI: buffer_load_dword
-; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
-; SI: s_endpgm
-; EG: AND_INT T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, 1,
-define void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 0, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-define void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %shl = shl i32 %x, 31
- %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 8)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-define void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %shl = shl i32 %x, 31
- %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_4:
-; SI-NOT: lshl
-; SI-NOT: shr
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %shl = shl i32 %x, 31
- %shr = lshr i32 %shl, 31
- %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 31, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_5:
-; SI: buffer_load_dword
-; SI-NOT: lshl
-; SI-NOT: shr
-; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1
-; SI: s_endpgm
-define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %shl = shl i32 %x, 31
- %shr = ashr i32 %shl, 31
- %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 0, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_6:
-; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; SI: s_endpgm
-define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %shl = shl i32 %x, 31
- %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 1, i32 31)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_7:
-; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %shl = shl i32 %x, 31
- %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 31)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_8:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %shl = shl i32 %x, 31
- %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_9:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 31, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_10:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 1, i32 31)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_11:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 8, i32 24)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_12:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 24, i32 8)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_13:
-; SI: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %shl = ashr i32 %x, 31
- %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_14:
-; SI-NOT: lshr
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %shl = lshr i32 %x, 31
- %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_0:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 0) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_1:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 12334, i32 0, i32 0) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_2:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 1) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_3:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 1, i32 0, i32 1) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_4:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 0, i32 1) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_5:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 7, i32 1) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_6:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x80
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 0, i32 8) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_7:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 0, i32 8) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_8:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 6, i32 8) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_9:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65536, i32 16, i32 8) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_10:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65535, i32 16, i32 16) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_11:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 4) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_12:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 31, i32 1) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_13:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 131070, i32 16, i32 16) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_14:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 2, i32 30) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_15:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 28) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_16:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 1, i32 7) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_17:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 1, i32 31) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_18:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 31, i32 1) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; Make sure that SimplifyDemandedBits doesn't cause the and to be
-; reduced to the bits demanded by the bfe.
-
-; XXX: The operand to v_bfe_u32 could also just directly be the load register.
-; FUNC-LABEL: {{^}}simplify_bfe_u32_multi_use_arg:
-; SI: buffer_load_dword [[ARG:v[0-9]+]]
-; SI: v_and_b32_e32 [[AND:v[0-9]+]], 63, [[ARG]]
-; SI: v_bfe_u32 [[BFE:v[0-9]+]], [[AND]], 2, 2
-; SI-DAG: buffer_store_dword [[AND]]
-; SI-DAG: buffer_store_dword [[BFE]]
-; SI: s_endpgm
-define void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
- i32 addrspace(1)* %out1,
- i32 addrspace(1)* %in) nounwind {
- %src = load i32, i32 addrspace(1)* %in, align 4
- %and = and i32 %src, 63
- %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %and, i32 2, i32 2) nounwind readnone
- store i32 %bfe_u32, i32 addrspace(1)* %out0, align 4
- store i32 %and, i32 addrspace(1)* %out1, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}lshr_and:
-; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
-; SI: buffer_store_dword
-define void @lshr_and(i32 addrspace(1)* %out, i32 %a) nounwind {
- %b = lshr i32 %a, 6
- %c = and i32 %b, 7
- store i32 %c, i32 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}v_lshr_and:
-; SI: v_bfe_u32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, 3
-; SI: buffer_store_dword
-define void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
- %c = lshr i32 %a, %b
- %d = and i32 %c, 7
- store i32 %d, i32 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}and_lshr:
-; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
-; SI: buffer_store_dword
-define void @and_lshr(i32 addrspace(1)* %out, i32 %a) nounwind {
- %b = and i32 %a, 448
- %c = lshr i32 %b, 6
- store i32 %c, i32 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}and_lshr2:
-; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
-; SI: buffer_store_dword
-define void @and_lshr2(i32 addrspace(1)* %out, i32 %a) nounwind {
- %b = and i32 %a, 511
- %c = lshr i32 %b, 6
- store i32 %c, i32 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}shl_lshr:
-; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x150002
-; SI: buffer_store_dword
-define void @shl_lshr(i32 addrspace(1)* %out, i32 %a) nounwind {
- %b = shl i32 %a, 9
- %c = lshr i32 %b, 11
- store i32 %c, i32 addrspace(1)* %out, align 8
- ret void
-}
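
A note on the magic immediates in the s_bfe_u32 checks above: they pack the field description into a single SGPR operand, with the offset in the low bits and the width in bits [22:16] of src1. That decoding is an assumption from the GCN ISA encoding, not something the tests spell out:

; 0x30006  = (3  << 16) | 6 -> 3-bit field at offset 6  (lshr_and, and_lshr, and_lshr2)
; 0x150002 = (21 << 16) | 2 -> 21-bit field at offset 2 (shl_lshr: shl by 9, lshr by 11)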
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll
deleted file mode 100644
index 2336109f4dad..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll
+++ /dev/null
@@ -1,56 +0,0 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-declare float @llvm.fabs.f32(float) nounwind readnone
-declare float @llvm.AMDGPU.clamp.f32(float, float, float) nounwind readnone
-
-; FUNC-LABEL: {{^}}clamp_0_1_f32:
-; SI: s_load_dword [[ARG:s[0-9]+]],
-; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], [[ARG]], 0 clamp{{$}}
-; SI: buffer_store_dword [[RESULT]]
-; SI: s_endpgm
-
-; EG: MOV_SAT
-define void @clamp_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
- %clamp = call float @llvm.AMDGPU.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone
- store float %clamp, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}clamp_fabs_0_1_f32:
-; SI: s_load_dword [[ARG:s[0-9]+]],
-; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], |[[ARG]]|, 0 clamp{{$}}
-; SI: buffer_store_dword [[RESULT]]
-; SI: s_endpgm
-define void @clamp_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
- %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone
- %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fabs, float 0.0, float 1.0) nounwind readnone
- store float %clamp, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}clamp_fneg_0_1_f32:
-; SI: s_load_dword [[ARG:s[0-9]+]],
-; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], -[[ARG]], 0 clamp{{$}}
-; SI: buffer_store_dword [[RESULT]]
-; SI: s_endpgm
-define void @clamp_fneg_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
- %src.fneg = fsub float -0.0, %src
- %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg, float 0.0, float 1.0) nounwind readnone
- store float %clamp, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}clamp_fneg_fabs_0_1_f32:
-; SI: s_load_dword [[ARG:s[0-9]+]],
-; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], -|[[ARG]]|, 0 clamp{{$}}
-; SI: buffer_store_dword [[RESULT]]
-; SI: s_endpgm
-define void @clamp_fneg_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
- %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone
- %src.fneg.fabs = fsub float -0.0, %src.fabs
- %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg.fabs, float 0.0, float 1.0) nounwind readnone
- store float %clamp, float addrspace(1)* %out, align 4
- ret void
-}
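
A note on why every clamp check above expects an add of zero (my reading, not stated in the file): VALU instructions carry an output clamp modifier that saturates the float result to [0.0, 1.0], and fabs/fneg fold into source modifiers, so a clamp to [0,1] lowers to an identity add with the clamp bit set:

; sketch of the lowering the checks encode (assumption):
; clamp(x, 0.0, 1.0)    -> v_add_f32_e64 v, x, 0 clamp
; clamp(|x|, 0.0, 1.0)  -> v_add_f32_e64 v, |x|, 0 clamp   ; fabs as a source modifier
; clamp(-x, 0.0, 1.0)   -> v_add_f32_e64 v, -x, 0 clamp    ; fneg as a source modifier
; clamp(-|x|, 0.0, 1.0) -> v_add_f32_e64 v, -|x|, 0 clamp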
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
index 59997d27683d..595f632b493d 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
@@ -4,15 +4,14 @@
; SI-LABEL: {{^}}kill_gs_const:
; SI-NOT: v_cmpx_le_f32
; SI: s_mov_b64 exec, 0
-
define amdgpu_gs void @kill_gs_const() {
main_body:
- %0 = icmp ule i32 0, 3
- %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00
- call void @llvm.AMDGPU.kill(float %1)
- %2 = icmp ule i32 3, 0
- %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00
- call void @llvm.AMDGPU.kill(float %3)
+ %tmp = icmp ule i32 0, 3
+ %tmp1 = select i1 %tmp, float 1.000000e+00, float -1.000000e+00
+ call void @llvm.AMDGPU.kill(float %tmp1)
+ %tmp2 = icmp ule i32 3, 0
+ %tmp3 = select i1 %tmp2, float 1.000000e+00, float -1.000000e+00
+ call void @llvm.AMDGPU.kill(float %tmp3)
ret void
}
@@ -21,16 +20,16 @@ main_body:
; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}}
; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}}
; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
-define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) {
+define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) {
entry:
- %tmp0 = fcmp olt float %13, 0.0
- call void @llvm.AMDGPU.kill(float %14)
- %tmp1 = select i1 %tmp0, float 1.0, float 0.0
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 1, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1)
+ %tmp0 = fcmp olt float %arg13, 0.000000e+00
+ call void @llvm.AMDGPU.kill(float %arg14)
+ %tmp1 = select i1 %tmp0, float 1.000000e+00, float 0.000000e+00
+ call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
ret void
}
-declare void @llvm.AMDGPU.kill(float)
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare void @llvm.AMDGPU.kill(float) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
-!0 = !{!"const", null, i32 1}
+attributes #0 = { nounwind }
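
The operand shuffle in kill_vcc_implicit_def follows the general llvm.SI.export to llvm.amdgcn.exp.f32 migration; the mapping below is read off this hunk together with the llvm.SI.export.ll tests deleted next, not from any documentation:

; llvm.SI.export(en, vm, done, tgt, compr, s0, s1, s2, s3)
;   -> llvm.amdgcn.exp.f32(tgt, en, s0, s1, s2, s3, done, vm)
; so (i32 15, i32 1, i32 1, i32 1, i32 1, ...) above became
;    (i32 1, i32 15, ..., i1 true, i1 true)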
diff --git a/test/CodeGen/AMDGPU/llvm.SI.export.ll b/test/CodeGen/AMDGPU/llvm.SI.export.ll
deleted file mode 100644
index 23a32dcfd943..000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.export.ll
+++ /dev/null
@@ -1,237 +0,0 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #0
-
-; GCN-LABEL: {{^}}test_export_zeroes:
-; GCN: exp mrt0 off, off, off, off{{$}}
-; GCN: exp mrt0 off, off, off, off done{{$}}
-define void @test_export_zeroes() #0 {
-
- call void @llvm.SI.export(i32 0, i32 0, i32 0, i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0)
- call void @llvm.SI.export(i32 0, i32 0, i32 1, i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0)
- ret void
-}
-
-; FIXME: Should not set up registers for the unused source registers.
-
-; GCN-LABEL: {{^}}test_export_en_src0:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}}
-define void @test_export_en_src0() #0 {
- call void @llvm.SI.export(i32 1, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- ret void
-}
-
-; GCN-LABEL: {{^}}test_export_en_src1:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}}
-define void @test_export_en_src1() #0 {
- call void @llvm.SI.export(i32 2, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- ret void
-}
-
-; GCN-LABEL: {{^}}test_export_en_src2:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}}
-define void @test_export_en_src2() #0 {
- call void @llvm.SI.export(i32 4, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- ret void
-}
-
-; GCN-LABEL: {{^}}test_export_en_src3:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}}
-define void @test_export_en_src3() #0 {
- call void @llvm.SI.export(i32 8, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- ret void
-}
-
-; GCN-LABEL: {{^}}test_export_en_src0_src1:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}}
-define void @test_export_en_src0_src1() #0 {
- call void @llvm.SI.export(i32 3, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- ret void
-}
-
-; GCN-LABEL: {{^}}test_export_en_src0_src2:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}}
-define void @test_export_en_src0_src2() #0 {
- call void @llvm.SI.export(i32 5, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- ret void
-}
-
-; GCN-LABEL: {{^}}test_export_en_src0_src3:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}}
-; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}}
-define void @test_export_en_src0_src3() #0 {
- call void @llvm.SI.export(i32 9, i32 0, i32 0, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- call void @llvm.SI.export(i32 9, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- ret void
-}
-
-; GCN-LABEL: {{^}}test_export_en_src0_src1_src2_src3:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_en_src0_src1_src2_src3() #0 {
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- ret void
-}
-
-; GCN-LABEL: {{^}}test_export_mrt7:
-; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 0.5
-; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}}
-; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}}
-define void @test_export_mrt7() #0 {
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 7, i32 0, float 0.5, float 0.5, float 0.5, float 0.5)
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 7, i32 0, float 0.5, float 0.5, float 0.5, float 0.5)
- ret void
-}
-
-; GCN-LABEL: {{^}}test_export_z:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_z() #0 {
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 8, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 8, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- ret void
-}
-
-; GCN-LABEL: {{^}}test_export_null:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_null() #0 {
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 9, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 9, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- ret void
-}
-
-; GCN-LABEL: {{^}}test_export_reserved10:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_reserved10() #0 {
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 10, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 10, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- ret void
-}
-
-; GCN-LABEL: {{^}}test_export_reserved11:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_reserved11() #0 {
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 11, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 11, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- ret void
-}
-
-; GCN-LABEL: {{^}}test_export_pos0:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_pos0() #0 {
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 12, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- ret void
-}
-
-; GCN-LABEL: {{^}}test_export_pos3:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_pos3() #0 {
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 15, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 15, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- ret void
-}
-
-; GCN-LABEL: {{^}}test_export_param0:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_param0() #0 {
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 32, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- ret void
-}
-
-; GCN-LABEL: {{^}}test_export_param31:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_param31() #0 {
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 63, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 63, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- ret void
-}
-
-; GCN-LABEL: {{^}}test_export_vm:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}}
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}}
-define void @test_export_vm() #0 {
- call void @llvm.SI.export(i32 15, i32 1, i32 0, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
- ret void
-}
-
-attributes #0 = { nounwind "ShaderType"="0" }
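
For anyone auditing this deletion, the target operand values exercised above decode as follows (collected from the checks themselves):

; tgt  0-7  -> exp mrt0..mrt7
; tgt  8    -> exp mrtz
; tgt  9    -> exp null
; tgt 10-11 -> reserved, printed as invalid_target_10/11
; tgt 12-15 -> exp pos0..pos3
; tgt 32-63 -> exp param0..param31
; the "done" flag comes from the third operand and "vm" from the second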
diff --git a/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll b/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll
deleted file mode 100644
index 9e7c3c2e6201..000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll
+++ /dev/null
@@ -1,59 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s
-;RUN: llc < %s -march=amdgcn -mcpu=kabini -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s
-;RUN: llc < %s -march=amdgcn -mcpu=stoney -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s
-
-;GCN-LABEL: {{^}}main:
-;GCN-NOT: s_wqm
-;GCN: s_mov_b32 m0
-;GCN-DAG: v_interp_mov_f32
-;GCN-DAG: v_interp_p1_f32
-;GCN-DAG: v_interp_p2_f32
-
-define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) {
-main_body:
- %5 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
- %6 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %4)
- %7 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %4)
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %6, float %7, float %7)
- ret void
-}
-
-; Test that v_interp_p1 uses different source and destination registers
-; on 16-bank LDS chips.
-
-; 16BANK-LABEL: {{^}}v_interp_p1_bank16_bug:
-; 16BANK-NOT: v_interp_p1_f32 [[DST:v[0-9]+]], [[DST]]
-
-define amdgpu_ps void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) {
-main_body:
- %22 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %7)
- %23 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7)
- %24 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %5, <2 x i32> %7)
- %25 = call float @fabs(float %22)
- %26 = call float @fabs(float %23)
- %27 = call float @fabs(float %24)
- %28 = call i32 @llvm.SI.packf16(float %25, float %26)
- %29 = bitcast i32 %28 to float
- %30 = call i32 @llvm.SI.packf16(float %27, float 1.000000e+00)
- %31 = bitcast i32 %30 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %29, float %31, float %29, float %31)
- ret void
-}
-
-; Function Attrs: readnone
-declare float @fabs(float) #1
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #0
-
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.constant(i32, i32, i32) #0
-
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #0
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind readnone }
-attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.gather4.ll b/test/CodeGen/AMDGPU/llvm.SI.gather4.ll
deleted file mode 100644
index aef9f660436e..000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.gather4.ll
+++ /dev/null
@@ -1,525 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-LABEL: {{^}}gather4_v2:
-;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_v2() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4:
-;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_cl:
-;CHECK: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_cl() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_l:
-;CHECK: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_l() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_b:
-;CHECK: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_b() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_b_cl:
-;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_b_cl() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_b_cl_v8:
-;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_b_cl_v8() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_lz_v2:
-;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_lz_v2() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_lz:
-;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_lz() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-
-
-;CHECK-LABEL: {{^}}gather4_o:
-;CHECK: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_o() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_cl_o:
-;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_cl_o() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_cl_o_v8:
-;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_cl_o_v8() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_l_o:
-;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_l_o() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_l_o_v8:
-;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_l_o_v8() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_b_o:
-;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_b_o() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_b_o_v8:
-;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_b_o_v8() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_b_cl_o:
-;CHECK: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_b_cl_o() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_lz_o:
-;CHECK: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_lz_o() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-
-
-;CHECK-LABEL: {{^}}gather4_c:
-;CHECK: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_cl:
-;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_cl() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_cl_v8:
-;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_cl_v8() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_l:
-;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_l() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_l_v8:
-;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_l_v8() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_b:
-;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_b() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_b_v8:
-;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_b_v8() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_b_cl:
-;CHECK: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_b_cl() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_lz:
-;CHECK: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_lz() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-
-
-;CHECK-LABEL: {{^}}gather4_c_o:
-;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_o() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_o_v8:
-;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_o_v8() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_cl_o:
-;CHECK: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_cl_o() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_l_o:
-;CHECK: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_l_o() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_b_o:
-;CHECK: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_b_o() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_b_cl_o:
-;CHECK: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_b_cl_o() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_lz_o:
-;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_lz_o() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_lz_o_v8:
-;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_lz_o_v8() {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_sgpr_bug:
-;
-; This used to crash due to a bug in FixSGPRCopies. Derived from the
-; report in https://bugs.freedesktop.org/show_bug.cgi?id=96877
-;
-;CHECK: s_load_dwordx4 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
-;CHECK: s_waitcnt lgkmcnt(0)
-;CHECK: s_mov_b32 s[[LO]], 0
-;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LO]]:[[HI]]] dmask:0x8
-define amdgpu_ps float @gather4_sgpr_bug() {
-main_body:
- %tmp = load <4 x i32>, <4 x i32> addrspace(2)* undef, align 16
- %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
- %tmp2 = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> %tmp1, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %tmp4 = extractelement <4 x float> %tmp2, i32 1
- %tmp9 = fadd float undef, %tmp4
- ret float %tmp9
-}
-
-declare <4 x float> @llvm.SI.gather4.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind readnone }
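The gather4_sgpr_bug test above encodes more than instruction selection: the
sampler descriptor operand (the <4 x i32>) of image_gather4 must live in
SGPRs, so the insertelement of a constant into the loaded descriptor has to be
lowered by FixSGPRCopies as a scalar move into the loaded quad rather than a
round trip through VGPRs. A minimal sketch of the machine code the CHECK lines
pin down, with illustrative (not prescribed) register numbers:

    s_load_dwordx4 s[4:7], s[0:1], 0x0    ; descriptor loaded into an SGPR quad
    s_waitcnt lgkmcnt(0)                  ; wait for the scalar load
    s_mov_b32 s4, 0                       ; insertelement of 0 becomes an SGPR move
    image_gather4_lz v[0:3], v[0:1], s[8:15], s[4:7] dmask:0x8

The checks only require that the s_mov_b32 write the low register of the same
quad the load defined; copying through a VGPR here is the pattern that used to
crash the pass.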
diff --git a/test/CodeGen/AMDGPU/llvm.SI.getlod.ll b/test/CodeGen/AMDGPU/llvm.SI.getlod.ll
deleted file mode 100644
index ac34d31b97c1..000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.getlod.ll
+++ /dev/null
@@ -1,44 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-LABEL: {{^}}getlod:
-;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da
-define amdgpu_ps void @getlod() {
-main_body:
- %r = call <4 x float> @llvm.SI.getlod.i32(i32 undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1)
- ret void
-}
-
-;CHECK-LABEL: {{^}}getlod_v2:
-;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da
-define amdgpu_ps void @getlod_v2() {
-main_body:
- %r = call <4 x float> @llvm.SI.getlod.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1)
- ret void
-}
-
-;CHECK-LABEL: {{^}}getlod_v4:
-;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da
-define amdgpu_ps void @getlod_v4() {
-main_body:
- %r = call <4 x float> @llvm.SI.getlod.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1)
- ret void
-}
-
-
-declare <4 x float> @llvm.SI.getlod.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.getlod.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.getlod.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.ll b/test/CodeGen/AMDGPU/llvm.SI.image.ll
deleted file mode 100644
index 50341e3e207f..000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.image.ll
+++ /dev/null
@@ -1,49 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-LABEL: {{^}}image_load:
-;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @image_load() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}image_load_mip:
-;CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @image_load_mip() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}getresinfo:
-;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @getresinfo() {
-main_body:
- %r = call <4 x float> @llvm.SI.getresinfo.i32(i32 undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.getresinfo.i32(i32, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll b/test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll
deleted file mode 100644
index 7cdd9559994e..000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll
+++ /dev/null
@@ -1,94 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s
-
-; CHECK-LABEL: {{^}}v1:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xd
-define amdgpu_ps void @v1(i32 %a1) {
-entry:
- %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %2 = extractelement <4 x float> %1, i32 0
- %3 = extractelement <4 x float> %1, i32 2
- %4 = extractelement <4 x float> %1, i32 3
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4)
- ret void
-}
-
-; CHECK-LABEL: {{^}}v2:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xb
-define amdgpu_ps void @v2(i32 %a1) {
-entry:
- %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %2 = extractelement <4 x float> %1, i32 0
- %3 = extractelement <4 x float> %1, i32 1
- %4 = extractelement <4 x float> %1, i32 3
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4)
- ret void
-}
-
-; CHECK-LABEL: {{^}}v3:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xe
-define amdgpu_ps void @v3(i32 %a1) {
-entry:
- %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %2 = extractelement <4 x float> %1, i32 1
- %3 = extractelement <4 x float> %1, i32 2
- %4 = extractelement <4 x float> %1, i32 3
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4)
- ret void
-}
-
-; CHECK-LABEL: {{^}}v4:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x7
-define amdgpu_ps void @v4(i32 %a1) {
-entry:
- %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %2 = extractelement <4 x float> %1, i32 0
- %3 = extractelement <4 x float> %1, i32 1
- %4 = extractelement <4 x float> %1, i32 2
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4)
- ret void
-}
-
-; CHECK-LABEL: {{^}}v5:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xa
-define amdgpu_ps void @v5(i32 %a1) {
-entry:
- %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %2 = extractelement <4 x float> %1, i32 1
- %3 = extractelement <4 x float> %1, i32 3
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3)
- ret void
-}
-
-; CHECK-LABEL: {{^}}v6:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x6
-define amdgpu_ps void @v6(i32 %a1) {
-entry:
- %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %2 = extractelement <4 x float> %1, i32 1
- %3 = extractelement <4 x float> %1, i32 2
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3)
- ret void
-}
-
-; CHECK-LABEL: {{^}}v7:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x9
-define amdgpu_ps void @v7(i32 %a1) {
-entry:
- %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %2 = extractelement <4 x float> %1, i32 0
- %3 = extractelement <4 x float> %1, i32 3
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3)
- ret void
-}
-
-declare <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
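The dmask values these tests check fall out of a simple rule: bit i of dmask
is set exactly when element i of the <4 x float> sample result is actually
used, which lets the selector shrink the returned register range. Worked out
for the functions above (the binary values restate the hex dmask in each
CHECK line):

    v1 uses elements {0,2,3} -> dmask = 0b1101 = 0xd
    v2 uses elements {0,1,3} -> dmask = 0b1011 = 0xb
    v3 uses elements {1,2,3} -> dmask = 0b1110 = 0xe
    v4 uses elements {0,1,2} -> dmask = 0b0111 = 0x7
    v5 uses elements {1,3}   -> dmask = 0b1010 = 0xa
    v6 uses elements {1,2}   -> dmask = 0b0110 = 0x6
    v7 uses elements {0,3}   -> dmask = 0b1001 = 0x9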
diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll b/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll
deleted file mode 100644
index 60077dc218fd..000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll
+++ /dev/null
@@ -1,309 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-LABEL: {{^}}sample:
-;CHECK: s_wqm
-;CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_cl:
-;CHECK: s_wqm
-;CHECK: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_cl() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_d:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_d() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_d_cl:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_d_cl() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_l:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_l() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_b:
-;CHECK: s_wqm
-;CHECK: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_b() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_b_cl:
-;CHECK: s_wqm
-;CHECK: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_b_cl() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_lz:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_lz() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_cd:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_cd() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_cd_cl:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_cd_cl() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c:
-;CHECK: s_wqm
-;CHECK: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_cl:
-;CHECK: s_wqm
-;CHECK: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_cl() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_d:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_d() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_d_cl:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_d_cl() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_l:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_l() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_b:
-;CHECK: s_wqm
-;CHECK: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_b() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_b_cl:
-;CHECK: s_wqm
-;CHECK: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_b_cl() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_lz:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_lz() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_cd:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_cd() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_cd_cl:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_cd_cl() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-
-declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind readnone }
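The s_wqm checks in this file split the sample variants by how the level of
detail is obtained: variants that derive an implicit LOD from per-quad pixel
derivatives (sample, sample_cl, sample_b, sample_b_cl, and their _c compare
forms) need helper lanes enabled, so the shader must enter whole quad mode,
while variants with explicit derivatives or an explicit LOD (sample_d,
sample_d_cl, sample_l, sample_lz, sample_cd, sample_cd_cl, and their _c forms)
do not. The pairing of FileCheck directives expresses this directly, e.g.:

    ;CHECK-LABEL: {{^}}sample_b:
    ;CHECK: s_wqm

    ;CHECK-LABEL: {{^}}sample_l:
    ;CHECK-NOT: s_wqm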
diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll b/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll
deleted file mode 100644
index 34d4f6825690..000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll
+++ /dev/null
@@ -1,309 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-LABEL: {{^}}sample:
-;CHECK: s_wqm
-;CHECK: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_cl:
-;CHECK: s_wqm
-;CHECK: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_cl() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_d:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_d() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_d_cl:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_d_cl() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_l:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_l() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_b:
-;CHECK: s_wqm
-;CHECK: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_b() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_b_cl:
-;CHECK: s_wqm
-;CHECK: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_b_cl() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_lz:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_lz() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_cd:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_cd() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_cd_cl:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_cd_cl() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c:
-;CHECK: s_wqm
-;CHECK: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_cl:
-;CHECK: s_wqm
-;CHECK: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_cl() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_d:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_d() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_d_cl:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_d_cl() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_l:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_l() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_b:
-;CHECK: s_wqm
-;CHECK: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_b() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_b_cl:
-;CHECK: s_wqm
-;CHECK: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_b_cl() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_lz:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_lz() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_cd:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_cd() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_cd_cl:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_cd_cl() {
-main_body:
- %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-
-declare <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind readnone }
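
The llvm.SI.image.sample.*.o tests removed above follow the non-offset variants deleted just before them: the legacy llvm.SI.image.sample family was superseded by the llvm.amdgcn.image.sample intrinsics. As a rough sketch of the replacement form only — the overload suffixes and the trailing i1 flag operands (unorm, glc, slc, lwe, da) are an assumption based on the intrinsic definitions of this era, not part of this patch:

    ; Sketch: sampling with the llvm.amdgcn intrinsic family (assumed signature).
    define amdgpu_ps <4 x float> @sample_sketch(<4 x float> %coord, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp) {
    main_body:
      ; dmask = 0xf, all i1 modifier flags false
      %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %coord, <8 x i32> %rsrc, <4 x i32> %samp, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
      ret <4 x float> %r
    }

    declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
    attributes #0 = { nounwind readnone }
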
diff --git a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll
index ee0a41f2210f..51f564d96909 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll
@@ -34,8 +34,8 @@ main_body:
%tmp22 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 1234, i32 65535, i32 1, i32 1, i32 1, i32 1, i32 0)
%tmp23 = bitcast i32 %tmp22 to float
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp13, float %tmp15, float %tmp17, float %tmp19)
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp21, float %tmp23, float %tmp23, float %tmp23)
+ call void @llvm.amdgcn.exp.f32(i32 15, i32 12, float %tmp13, float %tmp15, float %tmp17, float %tmp19, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.f32(i32 15, i32 12, float %tmp21, float %tmp23, float %tmp23, float %tmp23, i1 true, i1 false)
ret void
}
@@ -45,9 +45,10 @@ declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i3
; Function Attrs: nounwind readonly
declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #0
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
attributes #0 = { nounwind readonly }
+attributes #1 = { nounwind inaccessiblememonly }
!0 = !{!"const", !1, i32 1}
!1 = !{!"tbaa root"}
diff --git a/test/CodeGen/AMDGPU/llvm.SI.packf16.ll b/test/CodeGen/AMDGPU/llvm.SI.packf16.ll
deleted file mode 100644
index 6984b4cf488a..000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.packf16.ll
+++ /dev/null
@@ -1,28 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-
-; GCN-LABEL: {{^}}main:
-; GCN: v_cvt_pkrtz_f16_f32
-; GCN: v_cvt_pkrtz_f16_f32
-; GCN-NOT: v_cvt_pkrtz_f16_f32
-
-define amdgpu_ps void @main(float %src) {
-main_body:
- %p1 = call i32 @llvm.SI.packf16(float undef, float %src)
- %p2 = call i32 @llvm.SI.packf16(float %src, float undef)
- %p3 = call i32 @llvm.SI.packf16(float undef, float undef)
- %f1 = bitcast i32 %p1 to float
- %f2 = bitcast i32 %p2 to float
- %f3 = bitcast i32 %p3 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 0, i32 0, i32 1, float undef, float %f1, float undef, float %f1)
- call void @llvm.SI.export(i32 15, i32 1, i32 0, i32 0, i32 1, float undef, float %f2, float undef, float %f2)
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %f3, float undef, float %f2)
- ret void
-}
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #0
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind readnone }
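
The packf16 coverage is dropped rather than ported: llvm.SI.packf16 corresponds to llvm.amdgcn.cvt.pkrtz, which returns <2 x half> directly instead of an i32 that must be bitcast. A sketch under that assumption (not part of this patch):

    ; Sketch: round-to-zero pack of two f32 values into packed f16.
    declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #0

    define amdgpu_ps <2 x half> @pkrtz_sketch(float %a, float %b) {
      %p = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
      ret <2 x half> %p
    }

    attributes #0 = { nounwind readnone }
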
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
index 9c845e84bc12..56966a19cf7b 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
@@ -1,21 +1,45 @@
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32) #2
-declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32) #2
-declare i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* nocapture, i32) #2
+declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #2
+declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
+declare i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* nocapture, i32, i32, i32, i1) #2
-declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64) #2
-declare i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* nocapture, i64) #2
-declare i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* nocapture, i64) #2
+declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #2
+declare i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* nocapture, i64, i32, i32, i1) #2
+declare i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* nocapture, i64, i32, i32, i1) #2
declare i32 @llvm.amdgcn.workitem.id.x() #1
+; Make sure there is no crash on an invalid non-constant ordering
+; operand, which must be a constant.
+; GCN-LABEL: {{^}}invalid_variable_order_lds_atomic_dec_ret_i32:
+define amdgpu_kernel void @invalid_variable_order_lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %order.var) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 %order.var, i32 0, i1 false)
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; Make sure there is no crash on an invalid non-constant scope
+; operand, which must be a constant.
+; GCN-LABEL: {{^}}invalid_variable_scope_lds_atomic_dec_ret_i32:
+define amdgpu_kernel void @invalid_variable_scope_lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %scope.var) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 %scope.var, i1 false)
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; Make sure there is no crash on an invalid non-constant volatile
+; operand, which must be a constant.
+; GCN-LABEL: {{^}}invalid_variable_volatile_lds_atomic_dec_ret_i32:
+define amdgpu_kernel void @invalid_variable_volatile_lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i1 %volatile.var) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 %volatile.var)
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
-define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+define amdgpu_kernel void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
store i32 %result, i32 addrspace(1)* %out
ret void
}
@@ -23,9 +47,9 @@ define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32_offset:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16
-define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false)
store i32 %result, i32 addrspace(1)* %out
ret void
}
@@ -35,25 +59,25 @@ define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; GCN: ds_dec_u32 [[VPTR]], [[DATA]]
-define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+define amdgpu_kernel void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
ret void
}
; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: ds_dec_u32 v{{[0-9]+}}, [[K]] offset:16
-define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false)
ret void
}
; GCN-LABEL: {{^}}global_atomic_dec_ret_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
-define void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
+define amdgpu_kernel void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
store i32 %result, i32 addrspace(1)* %out
ret void
}
@@ -61,26 +85,26 @@ define void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)*
; GCN-LABEL: {{^}}global_atomic_dec_ret_i32_offset:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}}
-define void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
store i32 %result, i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}global_atomic_dec_noret_i32:
; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind {
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
+define amdgpu_kernel void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
ret void
}
; FUNC-LABEL: {{^}}global_atomic_dec_noret_i32_offset:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
-define void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
+define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
ret void
}
@@ -88,12 +112,12 @@ define void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}}
; VI: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id
%gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
store i32 %result, i32 addrspace(1)* %out.gep
ret void
}
@@ -102,19 +126,19 @@ define void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
; VI: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
%gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
ret void
}
; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @flat_atomic_dec_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42)
+define amdgpu_kernel void @flat_atomic_dec_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42, i32 0, i32 0, i1 false)
store i32 %result, i32 addrspace(4)* %out
ret void
}
@@ -122,38 +146,38 @@ define void @flat_atomic_dec_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %
; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32_offset:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @flat_atomic_dec_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false)
store i32 %result, i32 addrspace(4)* %out
ret void
}
; FUNC-LABEL: {{^}}flat_atomic_dec_noret_i32:
; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @flat_atomic_dec_noret_i32(i32 addrspace(4)* %ptr) nounwind {
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42)
+define amdgpu_kernel void @flat_atomic_dec_noret_i32(i32 addrspace(4)* %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42, i32 0, i32 0, i1 false)
ret void
}
; FUNC-LABEL: {{^}}flat_atomic_dec_noret_i32_offset:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @flat_atomic_dec_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind {
+define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false)
ret void
}
; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32_offset_addr64:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @flat_atomic_dec_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id
%out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 %id
%gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false)
store i32 %result, i32 addrspace(4)* %out.gep
ret void
}
@@ -161,11 +185,11 @@ define void @flat_atomic_dec_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 a
; GCN-LABEL: {{^}}flat_atomic_dec_noret_i32_offset_addr64:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @flat_atomic_dec_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id
%gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false)
ret void
}
@@ -173,8 +197,8 @@ define void @flat_atomic_dec_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @flat_atomic_dec_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42)
+define amdgpu_kernel void @flat_atomic_dec_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42, i32 0, i32 0, i1 false)
store i64 %result, i64 addrspace(4)* %out
ret void
}
@@ -183,9 +207,9 @@ define void @flat_atomic_dec_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @flat_atomic_dec_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false)
store i64 %result, i64 addrspace(4)* %out
ret void
}
@@ -194,8 +218,8 @@ define void @flat_atomic_dec_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspac
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
-define void @flat_atomic_dec_noret_i64(i64 addrspace(4)* %ptr) nounwind {
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42)
+define amdgpu_kernel void @flat_atomic_dec_noret_i64(i64 addrspace(4)* %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -203,35 +227,35 @@ define void @flat_atomic_dec_noret_i64(i64 addrspace(4)* %ptr) nounwind {
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
-define void @flat_atomic_dec_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind {
+define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false)
ret void
}
; GCN-LABEL: {{^}}flat_atomic_dec_ret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
%out.gep = getelementptr i64, i64 addrspace(4)* %out, i32 %id
%gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false)
store i64 %result, i64 addrspace(4)* %out.gep
ret void
}
; GCN-LABEL: {{^}}flat_atomic_dec_noret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
-define void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
%gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -240,11 +264,11 @@ define void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0
; SI-LABEL: {{^}}atomic_dec_shl_base_lds_0:
; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; SI: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]] offset:8
-define void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
- %val0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9)
+ %val0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9, i32 0, i32 0, i1 false)
store i32 %idx.0, i32 addrspace(1)* %add_use
store i32 %val0, i32 addrspace(1)* %out
ret void
@@ -254,8 +278,8 @@ define void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
-define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
+define amdgpu_kernel void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false)
store i64 %result, i64 addrspace(1)* %out
ret void
}
@@ -264,9 +288,9 @@ define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32
-define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false)
store i64 %result, i64 addrspace(1)* %out
ret void
}
@@ -275,8 +299,8 @@ define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
-define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
+define amdgpu_kernel void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -284,9 +308,9 @@ define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}}
-define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -294,8 +318,8 @@ define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
-define void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
+define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
store i64 %result, i64 addrspace(1)* %out
ret void
}
@@ -304,9 +328,9 @@ define void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
-define void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
store i64 %result, i64 addrspace(1)* %out
ret void
}
@@ -315,8 +339,8 @@ define void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrsp
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind {
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
+define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -324,37 +348,37 @@ define void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind {
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
-define void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
+define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
ret void
}
; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id
%gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
store i64 %result, i64 addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
-define void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
%gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -363,11 +387,11 @@ define void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #
; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0_i64:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
-define void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0
- %val0 = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9)
+ %val0 = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9, i32 0, i32 0, i1 false)
store i32 %idx.0, i32 addrspace(1)* %add_use
store i64 %val0, i64 addrspace(1)* %out
ret void
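
The signature change running through this file (and llvm.amdgcn.atomic.inc.ll below) appends three operands to the atomic intrinsics: an i32 ordering, an i32 scope, and an i1 volatile flag — the naming follows the invalid_variable_{order,scope,volatile} tests added above, which pin down that a non-constant operand is rejected gracefully instead of crashing instruction selection. In the updated form, the plain decrement used throughout this test looks like:

    ; Sketch mirroring the rewritten calls: value 42, ordering 0,
    ; scope 0, non-volatile.
    declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1)

    define amdgpu_kernel void @dec_sketch(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) {
      %v = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
      store i32 %v, i32 addrspace(1)* %out
      ret void
    }
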
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
index 22097418eec4..3d64f93db2e4 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -1,21 +1,21 @@
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32) #2
-declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32) #2
-declare i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* nocapture, i32) #2
+declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #2
+declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
+declare i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* nocapture, i32, i32, i32, i1) #2
-declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64) #2
-declare i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* nocapture, i64) #2
-declare i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* nocapture, i64) #2
+declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #2
+declare i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* nocapture, i64, i32, i32, i1) #2
+declare i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* nocapture, i64, i32, i32, i1) #2
declare i32 @llvm.amdgcn.workitem.id.x() #1
; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
-define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+define amdgpu_kernel void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
store i32 %result, i32 addrspace(1)* %out
ret void
}
@@ -23,9 +23,9 @@ define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32_offset:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16
-define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false)
store i32 %result, i32 addrspace(1)* %out
ret void
}
@@ -35,25 +35,25 @@ define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; GCN: ds_inc_u32 [[VPTR]], [[DATA]]
-define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+define amdgpu_kernel void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
ret void
}
; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: ds_inc_u32 v{{[0-9]+}}, [[K]] offset:16
-define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false)
ret void
}
; GCN-LABEL: {{^}}global_atomic_inc_ret_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
-define void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
+define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
store i32 %result, i32 addrspace(1)* %out
ret void
}
@@ -61,26 +61,26 @@ define void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)*
; GCN-LABEL: {{^}}global_atomic_inc_ret_i32_offset:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}}
-define void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
store i32 %result, i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}global_atomic_inc_noret_i32:
; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind {
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
+define amdgpu_kernel void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
ret void
}
; FUNC-LABEL: {{^}}global_atomic_inc_noret_i32_offset:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
-define void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
+define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
ret void
}
@@ -88,12 +88,12 @@ define void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CI: buffer_atomic_inc [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}}
; VI: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id
%gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
store i32 %result, i32 addrspace(1)* %out.gep
ret void
}
@@ -102,11 +102,11 @@ define void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CI: buffer_atomic_inc [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
; VI: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
%gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
ret void
}
@@ -115,11 +115,11 @@ define void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #
; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
-define void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
- %val0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9)
+ %val0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9, i32 0, i32 0, i1 false)
store i32 %idx.0, i32 addrspace(1)* %add_use
store i32 %val0, i32 addrspace(1)* %out
ret void
@@ -129,8 +129,8 @@ define void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
-define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
+define amdgpu_kernel void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false)
store i64 %result, i64 addrspace(1)* %out
ret void
}
@@ -139,9 +139,9 @@ define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32
-define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false)
store i64 %result, i64 addrspace(1)* %out
ret void
}
@@ -150,8 +150,8 @@ define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: ds_inc_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
-define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
+define amdgpu_kernel void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -159,9 +159,9 @@ define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: ds_inc_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}}
-define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -169,8 +169,8 @@ define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
-define void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
+define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
store i64 %result, i64 addrspace(1)* %out
ret void
}
@@ -179,9 +179,9 @@ define void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
-define void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
store i64 %result, i64 addrspace(1)* %out
ret void
}
@@ -190,8 +190,8 @@ define void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrsp
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind {
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
+define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -199,45 +199,45 @@ define void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind {
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
-define void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
+define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
ret void
}
; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id
%gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
store i64 %result, i64 addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
-define void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
%gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
ret void
}
; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @flat_atomic_inc_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42)
+define amdgpu_kernel void @flat_atomic_inc_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42, i32 0, i32 0, i1 false)
store i32 %result, i32 addrspace(4)* %out
ret void
}
@@ -245,38 +245,38 @@ define void @flat_atomic_inc_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %
; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32_offset:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @flat_atomic_inc_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false)
store i32 %result, i32 addrspace(4)* %out
ret void
}
; GCN-LABEL: {{^}}flat_atomic_inc_noret_i32:
; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @flat_atomic_inc_noret_i32(i32 addrspace(4)* %ptr) nounwind {
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42)
+define amdgpu_kernel void @flat_atomic_inc_noret_i32(i32 addrspace(4)* %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42, i32 0, i32 0, i1 false)
ret void
}
; GCN-LABEL: {{^}}flat_atomic_inc_noret_i32_offset:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @flat_atomic_inc_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind {
+define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false)
ret void
}
; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32_offset_addr64:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @flat_atomic_inc_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id
%out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 %id
%gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false)
store i32 %result, i32 addrspace(4)* %out.gep
ret void
}
@@ -284,11 +284,11 @@ define void @flat_atomic_inc_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 a
; GCN-LABEL: {{^}}flat_atomic_inc_noret_i32_offset_addr64:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @flat_atomic_inc_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id
%gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false)
ret void
}
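
; Note: the flat_atomic_inc tests above take addrspace(4) pointers; under the
; address-space numbering these tests assume (1 = global, 3 = local, 4 = flat),
; a global pointer can reach the flat variant through an addrspacecast. An
; illustrative sketch, not part of the patch:
declare i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* nocapture, i32, i32, i32, i1)

define amdgpu_kernel void @inc_via_flat(i32 addrspace(1)* %ptr) {
  ; cast global -> flat, then use the flat intrinsic variant
  %flat = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
  %r = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %flat, i32 42, i32 0, i32 0, i1 false)
  ret void
}
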
@@ -297,31 +297,22 @@ define void @flat_atomic_inc_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0
; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
-define void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0
- %val0 = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9)
+ %val0 = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9, i32 0, i32 0, i1 false)
store i32 %idx.0, i32 addrspace(1)* %add_use
store i64 %val0, i64 addrspace(1)* %out
ret void
}
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind argmemonly }
-
-
-
-
-
-
; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @flat_atomic_inc_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42)
+define amdgpu_kernel void @flat_atomic_inc_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42, i32 0, i32 0, i1 false)
store i64 %result, i64 addrspace(4)* %out
ret void
}
@@ -330,9 +321,9 @@ define void @flat_atomic_inc_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @flat_atomic_inc_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false)
store i64 %result, i64 addrspace(4)* %out
ret void
}
@@ -341,8 +332,8 @@ define void @flat_atomic_inc_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspac
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
-define void @flat_atomic_inc_noret_i64(i64 addrspace(4)* %ptr) nounwind {
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42)
+define amdgpu_kernel void @flat_atomic_inc_noret_i64(i64 addrspace(4)* %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42, i32 0, i32 0, i1 false)
ret void
}
@@ -350,34 +341,38 @@ define void @flat_atomic_inc_noret_i64(i64 addrspace(4)* %ptr) nounwind {
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
-define void @flat_atomic_inc_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind {
+define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false)
ret void
}
; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
%out.gep = getelementptr i64, i64 addrspace(4)* %out, i32 %id
%gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false)
store i64 %result, i64 addrspace(4)* %out.gep
ret void
}
; GCN-LABEL: {{^}}flat_atomic_inc_noret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
-define void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
%gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false)
ret void
}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind argmemonly }
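
; Note: throughout this patch the test entry points switch from the default
; calling convention to amdgpu_kernel, marking them as kernel entry points
; (kernarg-style argument passing, s_endpgm epilogue) rather than ordinary
; callable functions. The pattern in isolation:
;   old: define void @test(...)
;   new: define amdgpu_kernel void @test(...)
define amdgpu_kernel void @entry_sketch(i32 addrspace(1)* %out) {
  store i32 0, i32 addrspace(1)* %out
  ret void
}
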
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll
index 6d9db65e7d93..10bea8ea63b0 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll
@@ -8,7 +8,7 @@ declare void @llvm.amdgcn.buffer.wbinvl1() #0
; SI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xc4,0xe1,0x00,0x00,0x00,0x00]
; VI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xf8,0xe0,0x00,0x00,0x00,0x00]
; GCN-NEXT: s_endpgm
-define void @test_buffer_wbinvl1() #0 {
+define amdgpu_kernel void @test_buffer_wbinvl1() #0 {
call void @llvm.amdgcn.buffer.wbinvl1()
ret void
}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll
index 746298465e58..fe60d16d90f7 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll
@@ -6,7 +6,7 @@ declare void @llvm.amdgcn.buffer.wbinvl1.sc() #0
; SI-NEXT: ; BB#0:
; SI-NEXT: buffer_wbinvl1_sc ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00]
; SI-NEXT: s_endpgm
-define void @test_buffer_wbinvl1_sc() #0 {
+define amdgpu_kernel void @test_buffer_wbinvl1_sc() #0 {
call void @llvm.amdgcn.buffer.wbinvl1.sc()
ret void
}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
index 4e0f3c37f214..061c1469ed4d 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
@@ -8,7 +8,7 @@ declare void @llvm.amdgcn.buffer.wbinvl1.vol() #0
; CI-NEXT: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00]
; VI-NEXT: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xfc,0xe0,0x00,0x00,0x00,0x00]
; GCN: s_endpgm
-define void @test_buffer_wbinvl1_vol() #0 {
+define amdgpu_kernel void @test_buffer_wbinvl1_vol() #0 {
call void @llvm.amdgcn.buffer.wbinvl1.vol()
; This used to crash in the hazard recognizer
store i8 0, i8 addrspace(1)* undef, align 1
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
index 011a0fdbd219..f08d4b6c7915 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
@@ -10,7 +10,7 @@ declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b)
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
-define void @class_f16(
+define amdgpu_kernel void @class_f16(
i32 addrspace(1)* %r,
half addrspace(1)* %a,
i32 addrspace(1)* %b) {
@@ -31,7 +31,7 @@ entry:
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
-define void @class_f16_fabs(
+define amdgpu_kernel void @class_f16_fabs(
i32 addrspace(1)* %r,
half %a.val,
i32 %b.val) {
@@ -46,12 +46,12 @@ entry:
; GCN-LABEL: {{^}}class_f16_fneg
; GCN: s_load_dword s[[SA_F16:[0-9]+]]
; GCN: s_load_dword s[[SB_I32:[0-9]+]]
-; VI: v_trunc_f16_e32 v[[VA_F16:[0-9]+]], s[[SA_F16]]
-; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -v[[VA_F16]], s[[SB_I32]]
+; VI: v_trunc_f16_e64 v[[VA_F16:[0-9]+]], -s[[SA_F16]]
+; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], v[[VA_F16]], s[[SB_I32]]
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
-define void @class_f16_fneg(
+define amdgpu_kernel void @class_f16_fneg(
i32 addrspace(1)* %r,
half %a.val,
i32 %b.val) {
@@ -71,7 +71,7 @@ entry:
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
-define void @class_f16_fabs_fneg(
+define amdgpu_kernel void @class_f16_fabs_fneg(
i32 addrspace(1)* %r,
half %a.val,
i32 %b.val) {
@@ -91,7 +91,7 @@ entry:
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
-define void @class_f16_1(
+define amdgpu_kernel void @class_f16_1(
i32 addrspace(1)* %r,
half %a.val) {
entry:
@@ -108,7 +108,7 @@ entry:
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
-define void @class_f16_64(
+define amdgpu_kernel void @class_f16_64(
i32 addrspace(1)* %r,
half %a.val) {
entry:
@@ -126,7 +126,7 @@ entry:
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
-define void @class_f16_full_mask(
+define amdgpu_kernel void @class_f16_full_mask(
i32 addrspace(1)* %r,
half %a.val) {
entry:
@@ -144,7 +144,7 @@ entry:
; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc
; GCN: buffer_store_dword v[[VR_I32]]
; GCN: s_endpgm
-define void @class_f16_nine_bit_mask(
+define amdgpu_kernel void @class_f16_nine_bit_mask(
i32 addrspace(1)* %r,
half %a.val) {
entry:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
index 668c669e41e8..1fcdac537fba 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
@@ -14,7 +14,7 @@ declare double @llvm.fabs.f64(double) #1
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -29,7 +29,7 @@ define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
%a.fabs = call float @llvm.fabs.f32(float %a) #1
%result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1
%sext = sext i1 %result to i32
@@ -45,7 +45,7 @@ define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
%a.fneg = fsub float -0.0, %a
%result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1
%sext = sext i1 %result to i32
@@ -61,7 +61,7 @@ define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
%a.fabs = call float @llvm.fabs.f32(float %a) #1
%a.fneg.fabs = fsub float -0.0, %a.fabs
%result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1
@@ -76,7 +76,7 @@ define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b)
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 {
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -89,7 +89,7 @@ define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 {
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 {
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -104,7 +104,7 @@ define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 {
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1023) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -118,7 +118,7 @@ define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -132,7 +132,7 @@ define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -150,7 +150,7 @@ define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -170,7 +170,7 @@ define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -190,7 +190,7 @@ define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i3
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -205,7 +205,7 @@ define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
%a.fabs = call double @llvm.fabs.f64(double %a) #1
%result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1
%sext = sext i1 %result to i32
@@ -221,7 +221,7 @@ define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
%a.fneg = fsub double -0.0, %a
%result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1
%sext = sext i1 %result to i32
@@ -237,7 +237,7 @@ define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
%a.fabs = call double @llvm.fabs.f64(double %a) #1
%a.fneg.fabs = fsub double -0.0, %a.fabs
%result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1
@@ -249,7 +249,7 @@ define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b)
; SI-LABEL: {{^}}test_class_1_f64:
; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}}
; SI: s_endpgm
-define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 {
+define amdgpu_kernel void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 {
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 1) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -259,7 +259,7 @@ define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 {
; SI-LABEL: {{^}}test_class_64_f64:
; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}}
; SI: s_endpgm
-define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 {
+define amdgpu_kernel void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 {
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 64) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -275,7 +275,7 @@ define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 {
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 {
+define amdgpu_kernel void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 {
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -290,7 +290,7 @@ define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 {
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -306,7 +306,7 @@ define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace
; XSI: v_cmp_class_f64_e32 vcc, 1.0,
; SI: v_cmp_class_f64_e32 vcc,
; SI: s_endpgm
-define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -321,7 +321,7 @@ define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %
; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f64:
; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
; SI: s_endpgm
-define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -338,7 +338,7 @@ define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i3
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
-define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -358,7 +358,7 @@ define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)
; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
-define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -381,7 +381,7 @@ define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1
; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
-define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -416,7 +416,7 @@ define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float ad
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
-define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -436,7 +436,7 @@ define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
-define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -456,7 +456,7 @@ define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)
; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}}
; SI: s_or_b64
; SI: s_endpgm
-define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 {
+define amdgpu_kernel void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -476,7 +476,7 @@ define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 0) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -488,7 +488,7 @@ define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 {
+define amdgpu_kernel void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 {
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 0) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -500,7 +500,7 @@ define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 {
; SI-NOT: v_cmp_class
; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1,
; SI: buffer_store_dword
-define void @test_class_undef_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_undef_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
%result = call i1 @llvm.amdgcn.class.f32(float undef, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
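
; Note: the class intrinsics above map to v_cmp_class_{f16,f32,f64}, whose i32
; operand is a bitmask of floating-point classes to test. The masks in these
; tests (3 = signaling|quiet NaN, 1023 = all ten classes) are consistent with
; the usual bit order: SNaN, QNaN, -inf, -normal, -subnormal, -0, +0,
; +subnormal, +normal, +inf. A sketch of an is-NaN test built on that
; assumption:
declare i1 @llvm.amdgcn.class.f32(float, i32)

define amdgpu_kernel void @is_nan(i32 addrspace(1)* %out, float %x) {
  ; mask 3 = bit 0 (signaling NaN) | bit 1 (quiet NaN)
  %isnan = call i1 @llvm.amdgcn.class.f32(float %x, i32 3)
  %sext = sext i1 %isnan to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}
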
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
index 410ac59279a5..054388607293 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
@@ -7,7 +7,7 @@ declare half @llvm.amdgcn.cos.f16(half %a)
; VI: v_cos_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @cos_f16(
+define amdgpu_kernel void @cos_f16(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
index f6495d8155f7..5b9c83c11cf4 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
@@ -5,7 +5,7 @@ declare float @llvm.amdgcn.cos.f32(float) #0
; GCN-LABEL: {{^}}v_cos_f32:
; GCN: v_cos_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @v_cos_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @v_cos_f32(float addrspace(1)* %out, float %src) #1 {
%cos = call float @llvm.amdgcn.cos.f32(float %src) #0
store float %cos, float addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
index 22bed45ee30f..dadb070bdcf8 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
@@ -5,7 +5,7 @@ declare float @llvm.amdgcn.cubeid(float, float, float) #0
; GCN-LABEL: {{^}}test_cubeid:
; GCN: v_cubeid_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @test_cubeid(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @test_cubeid(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
%result = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
store float %result, float addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
index 565f22c5d5b6..60c4618a011b 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
@@ -5,7 +5,7 @@ declare float @llvm.amdgcn.cubema(float, float, float) #0
; GCN-LABEL: {{^}}test_cubema:
; GCN: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @test_cubema(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @test_cubema(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
%result = call float @llvm.amdgcn.cubema(float %a, float %b, float %c)
store float %result, float addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
index a3ba32745814..10669cf99138 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
@@ -5,7 +5,7 @@ declare float @llvm.amdgcn.cubesc(float, float, float) #0
; GCN-LABEL: {{^}}test_cubesc:
; GCN: v_cubesc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @test_cubesc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @test_cubesc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
%result = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c)
store float %result, float addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
index d3c0f2851ead..b2770308c170 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
@@ -5,7 +5,7 @@ declare float @llvm.amdgcn.cubetc(float, float, float) #0
; GCN-LABEL: {{^}}test_cubetc:
; GCN: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @test_cubetc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @test_cubetc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
%result = call float @llvm.amdgcn.cubetc(float %a, float %b, float %c)
store float %result, float addrspace(1)* %out
ret void
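
; Note: the four cube intrinsics tested above are the pieces of the hardware
; cubemap addressing sequence: cubeid yields the face index, cubesc/cubetc the
; unnormalized s/t coordinates, and cubema the major-axis term. A sketch
; combining them; the division to normalize s/t is conventional usage, not
; something these tests check:
declare float @llvm.amdgcn.cubeid(float, float, float)
declare float @llvm.amdgcn.cubesc(float, float, float)
declare float @llvm.amdgcn.cubetc(float, float, float)
declare float @llvm.amdgcn.cubema(float, float, float)

define amdgpu_kernel void @cube_face_coords(<3 x float> addrspace(1)* %out, float %x, float %y, float %z) {
  %id = call float @llvm.amdgcn.cubeid(float %x, float %y, float %z)
  %sc = call float @llvm.amdgcn.cubesc(float %x, float %y, float %z)
  %tc = call float @llvm.amdgcn.cubetc(float %x, float %y, float %z)
  %ma = call float @llvm.amdgcn.cubema(float %x, float %y, float %z)
  ; divide sc/tc by ma for face-local coordinates (the usual 0.5
  ; scale-and-bias to [0,1] is omitted here)
  %s = fdiv float %sc, %ma
  %t = fdiv float %tc, %ma
  %v0 = insertelement <3 x float> undef, float %id, i32 0
  %v1 = insertelement <3 x float> %v0, float %s, i32 1
  %v2 = insertelement <3 x float> %v1, float %t, i32 2
  store <3 x float> %v2, <3 x float> addrspace(1)* %out
  ret void
}
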
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
new file mode 100644
index 000000000000..b92eb34750d9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
@@ -0,0 +1,166 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
+
+; GCN-LABEL: {{^}}s_cvt_pkrtz_v2f16_f32:
+; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
+; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
+; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
+; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
+; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[VY]]
+define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 {
+ %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_cvt_pkrtz_samereg_v2f16_f32:
+; GCN: s_load_dword [[X:s[0-9]+]]
+; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[X]]
+define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 {
+ %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Folds to 0 on gfx9
+; GCN-LABEL: {{^}}s_cvt_pkrtz_undef_undef:
+; GCN-NEXT: ; BB#0
+; SI-NEXT: s_endpgm
+; VI-NEXT: s_endpgm
+; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out) #0 {
+ %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[A]], [[B]]
+; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], [[B]]
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
+ store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_reg_imm:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], 1.0
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 1.0)
+ store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_imm_reg:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, 1.0, [[A]]
+; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, 1.0, [[A]]
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 1.0, float %a)
+ store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_lo:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], [[B]]
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %neg.a = fsub float -0.0, %a
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %b)
+ store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_hi:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], -[[B]]
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %neg.b = fsub float -0.0, %b
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %neg.b)
+ store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], -[[B]]
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %neg.a = fsub float -0.0, %a
+ %neg.b = fsub float -0.0, %b
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %neg.b)
+ store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -|[[A]]|, -[[B]]
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fabs.a = call float @llvm.fabs.f32(float %a)
+ %neg.fabs.a = fsub float -0.0, %fabs.a
+ %neg.b = fsub float -0.0, %b
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.fabs.a, float %neg.b)
+ store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+ ret void
+}
+
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
+declare float @llvm.fabs.f32(float) #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
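
; Note: llvm.amdgcn.cvt.pkrtz converts two f32 operands to f16 with
; round-toward-zero and packs them into one dword; the fneg_lo/fneg_hi checks
; above rely on the first operand landing in the low half. A minimal usage
; sketch:
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float)

define amdgpu_kernel void @pkrtz_lo(half addrspace(1)* %out, float %x, float %y) {
  %pk = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
  ; element 0 is f16(%x), rounded toward zero
  %lo = extractelement <2 x half> %pk, i32 0
  store half %lo, half addrspace(1)* %out
  ret void
}
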
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll
index 6c09aa592447..58250de2f891 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll
@@ -9,7 +9,7 @@ declare i64 @llvm.amdgcn.dispatch.id() #1
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s6
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s7
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @dispatch_id(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @dispatch_id(i64 addrspace(1)* %out) #0 {
%tmp0 = call i64 @llvm.amdgcn.dispatch.id()
store i64 %tmp0, i64 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
index 2e8625256f13..92208e7fe17c 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
@@ -6,7 +6,7 @@
; GCN-LABEL: {{^}}test:
; GCN: enable_sgpr_dispatch_ptr = 1
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
%dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
%header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
%value = load i32, i32 addrspace(2)* %header_ptr
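
; Note: llvm.amdgcn.dispatch.ptr returns a constant (addrspace 2) pointer to
; the HSA kernel dispatch packet; the test above just loads the first dword.
; Assuming the standard hsa_kernel_dispatch_packet_t layout, where
; workgroup_size_x is a u16 at byte offset 4, a sketch reading a named field:
declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()

define amdgpu_kernel void @read_wg_size_x(i32 addrspace(1)* %out) {
  %dp = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
  %gep = getelementptr i8, i8 addrspace(2)* %dp, i64 4
  %cast = bitcast i8 addrspace(2)* %gep to i16 addrspace(2)*
  %wgx = load i16, i16 addrspace(2)* %cast, align 2
  %ext = zext i16 %wgx to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
}
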
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
index 6d262cf497ac..e04d9e662cea 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
@@ -9,7 +9,7 @@ declare half @llvm.amdgcn.div.fixup.f16(half %a, half %b, half %c)
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @div_fixup_f16(
+define amdgpu_kernel void @div_fixup_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b,
@@ -30,7 +30,7 @@ entry:
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @div_fixup_f16_imm_a(
+define amdgpu_kernel void @div_fixup_f16_imm_a(
half addrspace(1)* %r,
half addrspace(1)* %b,
half addrspace(1)* %c) {
@@ -49,7 +49,7 @@ entry:
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @div_fixup_f16_imm_b(
+define amdgpu_kernel void @div_fixup_f16_imm_b(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %c) {
@@ -68,7 +68,7 @@ entry:
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @div_fixup_f16_imm_c(
+define amdgpu_kernel void @div_fixup_f16_imm_c(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -86,7 +86,7 @@ entry:
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AB_F16]], v[[AB_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @div_fixup_f16_imm_a_imm_b(
+define amdgpu_kernel void @div_fixup_f16_imm_a_imm_b(
half addrspace(1)* %r,
half addrspace(1)* %c) {
entry:
@@ -102,7 +102,7 @@ entry:
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[BC_F16]], v[[BC_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @div_fixup_f16_imm_b_imm_c(
+define amdgpu_kernel void @div_fixup_f16_imm_b_imm_c(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -118,7 +118,7 @@ entry:
; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AC_F16]], v[[B_F16]], v[[AC_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @div_fixup_f16_imm_a_imm_c(
+define amdgpu_kernel void @div_fixup_f16_imm_a_imm_c(
half addrspace(1)* %r,
half addrspace(1)* %b) {
entry:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
index cc1504f2bc8d..b8fcacf46bba 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
@@ -16,7 +16,7 @@ declare double @llvm.amdgcn.div.fixup.f64(double, double, double) nounwind readn
; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
-define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
+define amdgpu_kernel void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
%result = call float @llvm.amdgcn.div.fixup.f32(float %a, float %b, float %c) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
@@ -24,7 +24,7 @@ define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, fl
; GCN-LABEL: {{^}}test_div_fixup_f64:
; GCN: v_div_fixup_f64
-define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
+define amdgpu_kernel void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
%result = call double @llvm.amdgcn.div.fixup.f64(double %a, double %b, double %c) nounwind readnone
store double %result, double addrspace(1)* %out, align 8
ret void
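
The f32 checks above pin one operand to an SGPR ([[SA]]) and copy the other two into VGPRs ([[VB]], [[VC]]): as kernel arguments the floats arrive in scalar registers, and a VOP3 instruction can read at most one SGPR, so the remaining operands need v_mov copies first. A sketch of the IR shape that produces this (name illustrative):

    declare float @llvm.amdgcn.div.fixup.f32(float, float, float)

    ; %a, %b, %c are kernarg scalars; codegen keeps one in an SGPR and
    ; copies the other two to VGPRs to satisfy the one-SGPR-per-VOP3 limit
    define amdgpu_kernel void @scalar_operand_sketch(float addrspace(1)* %out, float %a, float %b, float %c) {
      %r = call float @llvm.amdgcn.div.fixup.f32(float %a, float %b, float %c)
      store float %r, float addrspace(1)* %out, align 4
      ret void
    }
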
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index d408fe9f87f6..a86468b07a27 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -20,7 +20,7 @@ declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind re
; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VB]], [[VA]], [[VC]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
-define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
@@ -34,7 +34,7 @@ define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, flo
; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
-define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
%result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
@@ -48,7 +48,7 @@ define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a,
; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
-define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
@@ -62,7 +62,7 @@ define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a,
; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
-define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
@@ -70,7 +70,7 @@ define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a,
; GCN-LABEL: {{^}}test_div_fmas_f64:
; GCN: v_div_fmas_f64
-define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind {
%result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
store double %result, double addrspace(1)* %out, align 8
ret void
@@ -79,7 +79,7 @@ define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b,
; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc:
; SI: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind {
%cmp = icmp eq i32 %i, 0
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
@@ -89,7 +89,7 @@ define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, f
; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc:
; SI: s_mov_b64 vcc, 0
; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
@@ -98,7 +98,7 @@ define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, f
; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc:
; SI: s_mov_b64 vcc, -1
; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
@@ -114,7 +114,7 @@ define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, fl
; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]]
; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]]
; SI: s_endpgm
-define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
@@ -150,7 +150,7 @@ define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, flo
; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; SI: buffer_store_dword
; SI: s_endpgm
-define void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.out = getelementptr float, float addrspace(1)* %out, i32 2
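
div.fmas is the one intrinsic in this batch whose i1 operand must be materialized in VCC, which is what the s_mov_b64 vcc and v_cmp ... vcc checks above encode. A condensed sketch of the compare-feeds-vcc shape, mirroring test_div_fmas_f32_cond_to_vcc:

    declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1)

    ; the icmp result is placed directly into VCC for v_div_fmas_f32
    define amdgpu_kernel void @fmas_vcc_sketch(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) {
      %cmp = icmp eq i32 %i, 0
      %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp)
      store float %result, float addrspace(1)* %out, align 4
      ret void
    }
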
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
index 8e5c62c31db5..0b4f09ac6517 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
@@ -11,7 +11,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -31,7 +31,7 @@ define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)*
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -51,7 +51,7 @@ define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)*
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
-define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -71,7 +71,7 @@ define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
-define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -91,7 +91,7 @@ define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -109,7 +109,7 @@ define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float add
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -127,7 +127,7 @@ define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float add
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -145,7 +145,7 @@ define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float add
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -163,7 +163,7 @@ define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float add
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
-define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -181,7 +181,7 @@ define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double a
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
-define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -199,7 +199,7 @@ define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double a
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
-define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -217,7 +217,7 @@ define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double a
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
-define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -236,7 +236,7 @@ define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double a
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind {
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
@@ -250,7 +250,7 @@ define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a,
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind {
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
@@ -265,7 +265,7 @@ define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a,
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}}
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
-define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind {
%result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
store double %result0, double addrspace(1)* %out, align 8
@@ -280,7 +280,7 @@ define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
-define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind {
%result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
store double %result0, double addrspace(1)* %out, align 8
@@ -292,7 +292,7 @@ define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%a = load float, float addrspace(1)* %gep.0, align 4
@@ -308,7 +308,7 @@ define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float a
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%a = load float, float addrspace(1)* %gep.0, align 4
@@ -326,7 +326,7 @@ define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float a
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[ABS_A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -349,7 +349,7 @@ define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspa
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[ABS_B]], [[ABS_B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
-define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
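
Unlike the other division intrinsics, div.scale returns a { value, flag } pair, so every test extracts field 0 before storing; the trailing i1 immediate selects which side of the division is being scaled (compare the false and true variants above). A minimal sketch:

    declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1)

    ; div.scale yields the scaled value plus an i1 scale flag; only field 0 is stored
    define amdgpu_kernel void @div_scale_pair_sketch(float addrspace(1)* %out, float %a, float %b) {
      %pair = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false)
      %val = extractvalue { float, i1 } %pair, 0
      store float %val, float addrspace(1)* %out, align 4
      ret void
    }
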
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
index 92d3fc8b107e..08f286a7f510 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
@@ -4,8 +4,7 @@ declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) #0
; FUNC-LABEL: {{^}}ds_bpermute:
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CHECK: s_waitcnt lgkmcnt
-define void @ds_bpermute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
+define amdgpu_kernel void @ds_bpermute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0
store i32 %bpermute, i32 addrspace(1)* %out, align 4
ret void
@@ -13,8 +12,7 @@ define void @ds_bpermute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind
; CHECK-LABEL: {{^}}ds_bpermute_imm_offset:
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
-; CHECK: s_waitcnt lgkmcnt
-define void @ds_bpermute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+define amdgpu_kernel void @ds_bpermute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
%index = add i32 %base_index, 4
%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0
store i32 %bpermute, i32 addrspace(1)* %out, align 4
@@ -23,8 +21,7 @@ define void @ds_bpermute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32
; CHECK-LABEL: {{^}}ds_bpermute_imm_index:
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:64
-; CHECK: s_waitcnt lgkmcnt
-define void @ds_bpermute_imm_index(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+define amdgpu_kernel void @ds_bpermute_imm_index(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 64, i32 %src) #0
store i32 %bpermute, i32 addrspace(1)* %out, align 4
ret void
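
These hunks also drop the s_waitcnt lgkmcnt checks, whose placement evidently shifted with the calling-convention change. For reference, ds_bpermute is a backward lane permute: each lane pulls %src from the lane its own %index names, with the index expressed in bytes (source lane = index/4), which is why the add of 4 in ds_bpermute_imm_offset folds into offset:4. A sketch:

    declare i32 @llvm.amdgcn.ds.bpermute(i32, i32)

    ; backward permute: each lane reads %src from the lane addressed by its
    ; own byte index; adding 4 shifts the source lane by one
    define amdgpu_kernel void @bpermute_sketch(i32 addrspace(1)* %out, i32 %index, i32 %src) {
      %idx = add i32 %index, 4
      %v = call i32 @llvm.amdgcn.ds.bpermute(i32 %idx, i32 %src)
      store i32 %v, i32 addrspace(1)* %out, align 4
      ret void
    }
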
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll
index 6d9c94191535..63618c3aed77 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll
@@ -4,8 +4,7 @@ declare i32 @llvm.amdgcn.ds.permute(i32, i32) #0
; CHECK-LABEL: {{^}}ds_permute:
; CHECK: ds_permute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CHECK: s_waitcnt lgkmcnt
-define void @ds_permute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
+define amdgpu_kernel void @ds_permute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
%bpermute = call i32 @llvm.amdgcn.ds.permute(i32 %index, i32 %src) #0
store i32 %bpermute, i32 addrspace(1)* %out, align 4
ret void
@@ -13,8 +12,7 @@ define void @ds_permute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
; CHECK-LABEL: {{^}}ds_permute_imm_offset:
; CHECK: ds_permute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
-; CHECK: s_waitcnt lgkmcnt
-define void @ds_permute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+define amdgpu_kernel void @ds_permute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
%index = add i32 %base_index, 4
%bpermute = call i32 @llvm.amdgcn.ds.permute(i32 %index, i32 %src) #0
store i32 %bpermute, i32 addrspace(1)* %out, align 4
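
ds_permute is the forward counterpart: each lane pushes its %src to the lane named by %index rather than pulling from it, with the same byte-granular indexing and offset folding as ds_bpermute. A sketch mirroring the tests:

    declare i32 @llvm.amdgcn.ds.permute(i32, i32)

    ; forward permute: lane i writes %src to the lane addressed by %index
    define amdgpu_kernel void @permute_sketch(i32 addrspace(1)* %out, i32 %index, i32 %src) {
      %v = call i32 @llvm.amdgcn.ds.permute(i32 %index, i32 %src)
      store i32 %v, i32 addrspace(1)* %out, align 4
      ret void
    }
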
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
index ef3cb00024bb..a3a78d326a62 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
@@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #0
; FUNC-LABEL: {{^}}ds_swizzle:
; CHECK: ds_swizzle_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:100
; CHECK: s_waitcnt lgkmcnt
-define void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind {
+define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind {
%swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
store i32 %swizzle, i32 addrspace(1)* %out, align 4
ret void
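
ds_swizzle differs from the permutes in that its pattern operand is an immediate baked into the instruction's offset field (the offset:100 in the check), not a per-lane register, so only %src varies at run time. A sketch:

    declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)

    ; the second operand must be a constant; it is encoded as offset:100
    define amdgpu_kernel void @swizzle_sketch(i32 addrspace(1)* %out, i32 %src) {
      %v = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100)
      store i32 %v, i32 addrspace(1)* %out, align 4
      ret void
    }
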
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll
new file mode 100644
index 000000000000..b972ddb8cb77
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll
@@ -0,0 +1,162 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
+
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
+declare void @llvm.amdgcn.exp.compr.v2i16(i32, i32, <2 x i16>, <2 x i16>, i1, i1) #0
+
+; GCN-LABEL: {{^}}test_export_compr_zeroes_v2f16:
+; GCN: exp mrt0 off, off, off, off compr{{$}}
+; GCN: exp mrt0 off, off, off, off done compr{{$}}
+define amdgpu_kernel void @test_export_compr_zeroes_v2f16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> zeroinitializer, <2 x half> zeroinitializer, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> zeroinitializer, <2 x half> zeroinitializer, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_en_src0_v2f16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
+; GCN: exp mrt0 [[SRC0]], [[SRC0]], off, off done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_src0_v2f16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_en_src1_v2f16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
+; GCN: exp mrt0 off, off, [[SRC1]], [[SRC1]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_src1_v2f16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 12, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_en_src0_src1_v2f16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
+; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_src0_src1_v2f16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_en_invalid2_v2f16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
+; GCN: exp mrt0 off, [[SRC0]], off, off done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_invalid2_v2f16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_en_invalid10_v2f16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
+; GCN: exp mrt0 off, [[SRC0]], off, [[SRC1]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_invalid10_v2f16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 10, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_mrt7_v2f16:
+; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 0x38003800
+; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] compr{{$}}
+; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_mrt7_v2f16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 7, i32 15, <2 x half> <half 0.5, half 0.5>, <2 x half> <half 0.5, half 0.5>, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 7, i32 15, <2 x half> <half 0.5, half 0.5>, <2 x half> <half 0.5, half 0.5>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_z_v2f16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
+; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr{{$}}
+; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_z_v2f16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 8, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 8, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_vm_v2f16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
+; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr vm{{$}}
+; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr vm{{$}}
+define amdgpu_kernel void @test_export_compr_vm_v2f16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 false, i1 true)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 true)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_zeroes_v2i16:
+; GCN: exp mrt0 off, off, off, off compr{{$}}
+; GCN: exp mrt0 off, off, off, off done compr{{$}}
+define amdgpu_kernel void @test_export_compr_zeroes_v2i16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 0, <2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 0, <2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_en_src0_v2i16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005
+; GCN: exp mrt0 [[SRC0]], off, off, off done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_src0_v2i16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 1, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_en_src1_v2i16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005
+; GCN: exp mrt0 off, off, [[SRC1]], [[SRC1]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_src1_v2i16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 12, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_en_src0_src1_v2i16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005
+; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_src0_src1_v2i16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_mrt7_v2i16:
+; GCN-DAG: v_mov_b32_e32 [[VI16:v[0-9]+]], 0x50005
+; GCN: exp mrt7 [[VI16]], [[VI16]], [[VI16]], [[VI16]] compr{{$}}
+; GCN: exp mrt7 [[VI16]], [[VI16]], [[VI16]], [[VI16]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_mrt7_v2i16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 7, i32 15, <2 x i16> <i16 5, i16 5>, <2 x i16> <i16 5, i16 5>, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 7, i32 15, <2 x i16> <i16 5, i16 5>, <2 x i16> <i16 5, i16 5>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_z_v2i16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005
+; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr{{$}}
+; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_z_v2i16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 8, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 8, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_vm_v2i16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005
+; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr vm{{$}}
+; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr vm{{$}}
+define amdgpu_kernel void @test_export_compr_vm_v2i16() #0 {
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 false, i1 true)
+ call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 true)
+ ret void
+}
+
+attributes #0 = { nounwind }
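
The new exp.compr tests exercise the en mask against packed operands: with compr set, en bits 0-1 gate the two halves of src0 and bits 2-3 gate src1, which is why en=3 prints [[SRC0]], [[SRC0]], off, off and en=12 prints off, off, [[SRC1]], [[SRC1]]. A worked sketch of the fully enabled case; the hex packings follow from half 1.0 = 0x3c00, 2.0 = 0x4000, 0.5 = 0x3800, 4.0 = 0x4400:

    declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1)

    define amdgpu_kernel void @exp_compr_sketch() {
      ; tgt=0 (mrt0), en=15 (all four channels), done set, vm clear
      call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15,
          <2 x half> <half 1.0, half 2.0>,   ; packs to 0x40003c00
          <2 x half> <half 0.5, half 4.0>,   ; packs to 0x44003800
          i1 true, i1 false)
      ret void
    }
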
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
new file mode 100644
index 000000000000..6d2de108829d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
@@ -0,0 +1,484 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
+
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
+declare void @llvm.amdgcn.exp.i32(i32, i32, i32, i32, i32, i32, i1, i1) #1
+
+; GCN-LABEL: {{^}}test_export_zeroes_f32:
+; GCN: exp mrt0 off, off, off, off{{$}}
+; GCN: exp mrt0 off, off, off, off done{{$}}
+define amdgpu_kernel void @test_export_zeroes_f32() #0 {
+
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 true, i1 false)
+ ret void
+}
+
+; FIXME: Should not set up registers for the unused source registers.
+
+; GCN-LABEL: {{^}}test_export_en_src0_f32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}}
+define amdgpu_kernel void @test_export_en_src0_f32() #0 {
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_en_src1_f32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}}
+define amdgpu_kernel void @test_export_en_src1_f32() #0 {
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_en_src2_f32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}}
+define amdgpu_kernel void @test_export_en_src2_f32() #0 {
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_en_src3_f32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_en_src3_f32() #0 {
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_en_src0_src1_f32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}}
+define amdgpu_kernel void @test_export_en_src0_src1_f32() #0 {
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_en_src0_src2_f32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}}
+define amdgpu_kernel void @test_export_en_src0_src2_f32() #0 {
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 5, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_en_src0_src3_f32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}}
+; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_en_src0_src3_f32() #0 {
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_en_src0_src1_src2_src3_f32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_en_src0_src1_src2_src3_f32() #0 {
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_mrt7_f32:
+; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 0.5
+; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}}
+; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}}
+define amdgpu_kernel void @test_export_mrt7_f32() #0 {
+ call void @llvm.amdgcn.exp.f32(i32 7, i32 15, float 0.5, float 0.5, float 0.5, float 0.5, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.f32(i32 7, i32 15, float 0.5, float 0.5, float 0.5, float 0.5, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_z_f32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_z_f32() #0 {
+ call void @llvm.amdgcn.exp.f32(i32 8, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.f32(i32 8, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_null_f32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_null_f32() #0 {
+ call void @llvm.amdgcn.exp.f32(i32 9, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.f32(i32 9, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_reserved10_f32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_reserved10_f32() #0 {
+ call void @llvm.amdgcn.exp.f32(i32 10, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.f32(i32 10, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_reserved11_f32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_reserved11_f32() #0 {
+ call void @llvm.amdgcn.exp.f32(i32 11, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.f32(i32 11, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_pos0_f32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_pos0_f32() #0 {
+ call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_pos3_f32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_pos3_f32() #0 {
+ call void @llvm.amdgcn.exp.f32(i32 15, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.f32(i32 15, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_param0_f32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_param0_f32() #0 {
+ call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_param31_f32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_param31_f32() #0 {
+ call void @llvm.amdgcn.exp.f32(i32 63, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.f32(i32 63, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_vm_f32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
+; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}}
+; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}}
+define amdgpu_kernel void @test_export_vm_f32() #0 {
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 true)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 true)
+ ret void
+}
+
+
+; GCN-LABEL: {{^}}test_export_zeroes_i32:
+; GCN: exp mrt0 off, off, off, off{{$}}
+; GCN: exp mrt0 off, off, off, off done{{$}}
+define amdgpu_kernel void @test_export_zeroes_i32() #0 {
+
+ call void @llvm.amdgcn.exp.i32(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.i32(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 true, i1 false)
+ ret void
+}
+
+; FIXME: Should not set up registers for the unused source registers.
+
+; GCN-LABEL: {{^}}test_export_en_src0_i32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
+; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}}
+define amdgpu_kernel void @test_export_en_src0_i32() #0 {
+ call void @llvm.amdgcn.exp.i32(i32 0, i32 1, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_en_src1_i32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
+; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}}
+define amdgpu_kernel void @test_export_en_src1_i32() #0 {
+ call void @llvm.amdgcn.exp.i32(i32 0, i32 2, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_en_src2_i32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
+; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}}
+define amdgpu_kernel void @test_export_en_src2_i32() #0 {
+ call void @llvm.amdgcn.exp.i32(i32 0, i32 4, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_en_src3_i32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
+; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_en_src3_i32() #0 {
+ call void @llvm.amdgcn.exp.i32(i32 0, i32 8, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_en_src0_src1_i32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
+; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}}
+define amdgpu_kernel void @test_export_en_src0_src1_i32() #0 {
+ call void @llvm.amdgcn.exp.i32(i32 0, i32 3, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_en_src0_src2_i32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
+; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}}
+define amdgpu_kernel void @test_export_en_src0_src2_i32() #0 {
+ call void @llvm.amdgcn.exp.i32(i32 0, i32 5, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_en_src0_src3_i32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
+; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}}
+; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_en_src0_src3_i32() #0 {
+ call void @llvm.amdgcn.exp.i32(i32 0, i32 9, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.i32(i32 0, i32 9, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_en_src0_src1_src2_src3_i32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
+; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_en_src0_src1_src2_src3_i32() #0 {
+ call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_mrt7_i32:
+; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 5
+; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}}
+; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}}
+define amdgpu_kernel void @test_export_mrt7_i32() #0 {
+ call void @llvm.amdgcn.exp.i32(i32 7, i32 15, i32 5, i32 5, i32 5, i32 5, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.i32(i32 7, i32 15, i32 5, i32 5, i32 5, i32 5, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_z_i32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
+; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_z_i32() #0 {
+ call void @llvm.amdgcn.exp.i32(i32 8, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.i32(i32 8, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_null_i32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
+; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_null_i32() #0 {
+ call void @llvm.amdgcn.exp.i32(i32 9, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.i32(i32 9, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_reserved10_i32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
+; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_reserved10_i32() #0 {
+ call void @llvm.amdgcn.exp.i32(i32 10, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.i32(i32 10, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_reserved11_i32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
+; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_reserved11_i32() #0 {
+ call void @llvm.amdgcn.exp.i32(i32 11, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.i32(i32 11, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_pos0_i32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
+; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_pos0_i32() #0 {
+ call void @llvm.amdgcn.exp.i32(i32 12, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.i32(i32 12, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_pos3_i32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
+; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_pos3_i32() #0 {
+ call void @llvm.amdgcn.exp.i32(i32 15, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.i32(i32 15, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_param0_i32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
+; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_param0_i32() #0 {
+ call void @llvm.amdgcn.exp.i32(i32 32, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.i32(i32 32, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_param31_i32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
+; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
+; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
+define amdgpu_kernel void @test_export_param31_i32() #0 {
+ call void @llvm.amdgcn.exp.i32(i32 63, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
+ call void @llvm.amdgcn.exp.i32(i32 63, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_export_vm_i32:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2
+; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
+; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
+; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}}
+; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}}
+define amdgpu_kernel void @test_export_vm_i32() #0 {
+ call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 true)
+ call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 true)
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind inaccessiblememonly }
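
Taken together, the new exp tests pin down how the first i32 operand selects the export target; from the checks above:

    tgt  0-7    mrt0..mrt7
    tgt  8      mrtz
    tgt  9      null
    tgt  10-11  reserved (printed as invalid_target_10/11)
    tgt  12-15  pos0..pos3
    tgt  32-63  param0..param31

So, as a minimal sketch, a final position export looks like:

    declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1)

    ; tgt=12 selects pos0; en=15 enables all four channels; done set, vm clear
    define amdgpu_kernel void @export_pos0_sketch() {
      call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
      ret void
    }
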
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll
index 427ad5ef553d..c9993ee88369 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll
@@ -5,9 +5,17 @@ declare i64 @llvm.amdgcn.fcmp.f32(float, float, i32) #0
declare i64 @llvm.amdgcn.fcmp.f64(double, double, i32) #0
declare float @llvm.fabs.f32(float) #0
+; GCN-LABEL: {{^}}v_fcmp_f32_dynamic_cc:
+; GCN: s_endpgm
+define amdgpu_kernel void @v_fcmp_f32_dynamic_cc(i64 addrspace(1)* %out, float %src0, float %src1, i32 %cc) {
+ %result = call i64 @llvm.amdgcn.fcmp.f32(float %src0, float %src1, i32 %cc)
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
; GCN-LABEL: {{^}}v_fcmp_f32_oeq_with_fabs:
; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, |{{v[0-9]+}}|, {{s[0-9]+}}
-define void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, float %src, float %a) {
+define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, float %src, float %a) {
%temp = call float @llvm.fabs.f32(float %a)
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1)
store i64 %result, i64 addrspace(1)* %out
@@ -16,7 +24,7 @@ define void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, float %src, float
; GCN-LABEL: {{^}}v_fcmp_f32_oeq_both_operands_with_fabs:
; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, |{{v[0-9]+}}|, |{{s[0-9]+}}|
-define void @v_fcmp_f32_oeq_both_operands_with_fabs(i64 addrspace(1)* %out, float %src, float %a) {
+define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(i64 addrspace(1)* %out, float %src, float %a) {
%temp = call float @llvm.fabs.f32(float %a)
%src_input = call float @llvm.fabs.f32(float %src)
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src_input, float %temp, i32 1)
@@ -26,7 +34,7 @@ define void @v_fcmp_f32_oeq_both_operands_with_fabs(i64 addrspace(1)* %out, floa
; GCN-LABEL: {{^}}v_fcmp:
; GCN-NOT: v_cmp_eq_f32_e64
-define void @v_fcmp(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp(i64 addrspace(1)* %out, float %src) {
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 -1)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -34,7 +42,7 @@ define void @v_fcmp(i64 addrspace(1)* %out, float %src) {
; GCN-LABEL: {{^}}v_fcmp_f32_oeq:
; GCN: v_cmp_eq_f32_e64
-define void @v_fcmp_f32_oeq(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_oeq(i64 addrspace(1)* %out, float %src) {
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 1)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -42,7 +50,7 @@ define void @v_fcmp_f32_oeq(i64 addrspace(1)* %out, float %src) {
; GCN-LABEL: {{^}}v_fcmp_f32_one:
; GCN: v_cmp_neq_f32_e64
-define void @v_fcmp_f32_one(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_one(i64 addrspace(1)* %out, float %src) {
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 6)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -50,7 +58,7 @@ define void @v_fcmp_f32_one(i64 addrspace(1)* %out, float %src) {
; GCN-LABEL: {{^}}v_fcmp_f32_ogt:
; GCN: v_cmp_gt_f32_e64
-define void @v_fcmp_f32_ogt(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ogt(i64 addrspace(1)* %out, float %src) {
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 2)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -58,7 +66,7 @@ define void @v_fcmp_f32_ogt(i64 addrspace(1)* %out, float %src) {
; GCN-LABEL: {{^}}v_fcmp_f32_oge:
; GCN: v_cmp_ge_f32_e64
-define void @v_fcmp_f32_oge(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_oge(i64 addrspace(1)* %out, float %src) {
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 3)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -66,7 +74,7 @@ define void @v_fcmp_f32_oge(i64 addrspace(1)* %out, float %src) {
; GCN-LABEL: {{^}}v_fcmp_f32_olt:
; GCN: v_cmp_lt_f32_e64
-define void @v_fcmp_f32_olt(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_olt(i64 addrspace(1)* %out, float %src) {
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 4)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -74,7 +82,7 @@ define void @v_fcmp_f32_olt(i64 addrspace(1)* %out, float %src) {
; GCN-LABEL: {{^}}v_fcmp_f32_ole:
; GCN: v_cmp_le_f32_e64
-define void @v_fcmp_f32_ole(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ole(i64 addrspace(1)* %out, float %src) {
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 5)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -83,7 +91,7 @@ define void @v_fcmp_f32_ole(i64 addrspace(1)* %out, float %src) {
; GCN-LABEL: {{^}}v_fcmp_f32_ueq:
; GCN: v_cmp_nlg_f32_e64
-define void @v_fcmp_f32_ueq(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ueq(i64 addrspace(1)* %out, float %src) {
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 9)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -91,7 +99,7 @@ define void @v_fcmp_f32_ueq(i64 addrspace(1)* %out, float %src) {
; GCN-LABEL: {{^}}v_fcmp_f32_une:
; GCN: v_cmp_neq_f32_e64
-define void @v_fcmp_f32_une(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_une(i64 addrspace(1)* %out, float %src) {
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 14)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -99,7 +107,7 @@ define void @v_fcmp_f32_une(i64 addrspace(1)* %out, float %src) {
; GCN-LABEL: {{^}}v_fcmp_f32_ugt:
; GCN: v_cmp_nle_f32_e64
-define void @v_fcmp_f32_ugt(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ugt(i64 addrspace(1)* %out, float %src) {
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 10)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -107,7 +115,7 @@ define void @v_fcmp_f32_ugt(i64 addrspace(1)* %out, float %src) {
; GCN-LABEL: {{^}}v_fcmp_f32_uge:
; GCN: v_cmp_nlt_f32_e64
-define void @v_fcmp_f32_uge(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_uge(i64 addrspace(1)* %out, float %src) {
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 11)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -115,7 +123,7 @@ define void @v_fcmp_f32_uge(i64 addrspace(1)* %out, float %src) {
; GCN-LABEL: {{^}}v_fcmp_f32_ult:
; GCN: v_cmp_nge_f32_e64
-define void @v_fcmp_f32_ult(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ult(i64 addrspace(1)* %out, float %src) {
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 12)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -123,7 +131,7 @@ define void @v_fcmp_f32_ult(i64 addrspace(1)* %out, float %src) {
; GCN-LABEL: {{^}}v_fcmp_f32_ule:
; GCN: v_cmp_ngt_f32_e64
-define void @v_fcmp_f32_ule(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ule(i64 addrspace(1)* %out, float %src) {
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 13)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -131,7 +139,7 @@ define void @v_fcmp_f32_ule(i64 addrspace(1)* %out, float %src) {
; GCN-LABEL: {{^}}v_fcmp_f64_oeq:
; GCN: v_cmp_eq_f64_e64
-define void @v_fcmp_f64_oeq(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_oeq(i64 addrspace(1)* %out, double %src) {
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 1)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -139,7 +147,7 @@ define void @v_fcmp_f64_oeq(i64 addrspace(1)* %out, double %src) {
; GCN-LABEL: {{^}}v_fcmp_f64_one:
; GCN: v_cmp_neq_f64_e64
-define void @v_fcmp_f64_one(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_one(i64 addrspace(1)* %out, double %src) {
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 6)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -147,7 +155,7 @@ define void @v_fcmp_f64_one(i64 addrspace(1)* %out, double %src) {
; GCN-LABEL: {{^}}v_fcmp_f64_ogt:
; GCN: v_cmp_gt_f64_e64
-define void @v_fcmp_f64_ogt(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ogt(i64 addrspace(1)* %out, double %src) {
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 2)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -155,7 +163,7 @@ define void @v_fcmp_f64_ogt(i64 addrspace(1)* %out, double %src) {
; GCN-LABEL: {{^}}v_fcmp_f64_oge:
; GCN: v_cmp_ge_f64_e64
-define void @v_fcmp_f64_oge(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_oge(i64 addrspace(1)* %out, double %src) {
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 3)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -163,7 +171,7 @@ define void @v_fcmp_f64_oge(i64 addrspace(1)* %out, double %src) {
; GCN-LABEL: {{^}}v_fcmp_f64_olt:
; GCN: v_cmp_lt_f64_e64
-define void @v_fcmp_f64_olt(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_olt(i64 addrspace(1)* %out, double %src) {
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 4)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -171,7 +179,7 @@ define void @v_fcmp_f64_olt(i64 addrspace(1)* %out, double %src) {
; GCN-LABEL: {{^}}v_fcmp_f64_ole:
; GCN: v_cmp_le_f64_e64
-define void @v_fcmp_f64_ole(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ole(i64 addrspace(1)* %out, double %src) {
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 5)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -179,7 +187,7 @@ define void @v_fcmp_f64_ole(i64 addrspace(1)* %out, double %src) {
; GCN-LABEL: {{^}}v_fcmp_f64_ueq:
; GCN: v_cmp_nlg_f64_e64
-define void @v_fcmp_f64_ueq(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ueq(i64 addrspace(1)* %out, double %src) {
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 9)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -187,7 +195,7 @@ define void @v_fcmp_f64_ueq(i64 addrspace(1)* %out, double %src) {
; GCN-LABEL: {{^}}v_fcmp_f64_une:
; GCN: v_cmp_neq_f64_e64
-define void @v_fcmp_f64_une(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_une(i64 addrspace(1)* %out, double %src) {
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 14)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -195,7 +203,7 @@ define void @v_fcmp_f64_une(i64 addrspace(1)* %out, double %src) {
; GCN-LABEL: {{^}}v_fcmp_f64_ugt:
; GCN: v_cmp_nle_f64_e64
-define void @v_fcmp_f64_ugt(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ugt(i64 addrspace(1)* %out, double %src) {
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 10)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -203,7 +211,7 @@ define void @v_fcmp_f64_ugt(i64 addrspace(1)* %out, double %src) {
; GCN-LABEL: {{^}}v_fcmp_f64_uge:
; GCN: v_cmp_nlt_f64_e64
-define void @v_fcmp_f64_uge(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_uge(i64 addrspace(1)* %out, double %src) {
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 11)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -211,7 +219,7 @@ define void @v_fcmp_f64_uge(i64 addrspace(1)* %out, double %src) {
; GCN-LABEL: {{^}}v_fcmp_f64_ult:
; GCN: v_cmp_nge_f64_e64
-define void @v_fcmp_f64_ult(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ult(i64 addrspace(1)* %out, double %src) {
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 12)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -219,7 +227,7 @@ define void @v_fcmp_f64_ult(i64 addrspace(1)* %out, double %src) {
; GCN-LABEL: {{^}}v_fcmp_f64_ule:
; GCN: v_cmp_ngt_f64_e64
-define void @v_fcmp_f64_ule(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ule(i64 addrspace(1)* %out, double %src) {
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 13)
store i64 %result, i64 addrspace(1)* %out
ret void
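
The i32 condition-code operand in these calls tracks LLVM's CmpInst::Predicate numbering, which is why the out-of-range constant -1 in @v_fcmp and the run-time %cc in @v_fcmp_f32_dynamic_cc must fold away (or at least not crash) rather than select a compare. The mapping exercised above, for reference:

; fcmp cc operand values used in this file (llvm::CmpInst::Predicate):
;   1 = FCMP_OEQ   2 = FCMP_OGT   3 = FCMP_OGE   4 = FCMP_OLT   5 = FCMP_OLE
;   6 = FCMP_ONE   9 = FCMP_UEQ  10 = FCMP_UGT  11 = FCMP_UGE  12 = FCMP_ULT
;  13 = FCMP_ULE  14 = FCMP_UNE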
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
index 54d7848da3bf..248ee9904da0 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
@@ -8,7 +8,7 @@ declare float @llvm.amdgcn.fdiv.fast(float, float) #0
; CHECK: v_rcp_f32_e32
; CHECK: v_mul_f32_e32
; CHECK: v_mul_f32_e32
-define void @test_fdiv_fast(float addrspace(1)* %out, float %a, float %b) #1 {
+define amdgpu_kernel void @test_fdiv_fast(float addrspace(1)* %out, float %a, float %b) #1 {
%fdiv = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)
store float %fdiv, float addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll
new file mode 100644
index 000000000000..a4ae37b23c5f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll
@@ -0,0 +1,39 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_fmed3_f16:
+; GCN: v_med3_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @test_fmed3_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 {
+ %src0.f16 = trunc i32 %src0.arg to i16
+ %src0 = bitcast i16 %src0.f16 to half
+ %src1.f16 = trunc i32 %src1.arg to i16
+ %src1 = bitcast i16 %src1.f16 to half
+ %src2.f16 = trunc i32 %src2.arg to i16
+ %src2 = bitcast i16 %src2.f16 to half
+ %mad = call half @llvm.amdgcn.fmed3.f16(half %src0, half %src1, half %src2)
+ store half %mad, half addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fmed3_srcmods_f16:
+; GCN: v_med3_f16 v{{[0-9]+}}, -s{{[0-9]+}}, |v{{[0-9]+}}|, -|v{{[0-9]+}}|
+define amdgpu_kernel void @test_fmed3_srcmods_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 {
+ %src0.f16 = trunc i32 %src0.arg to i16
+ %src0 = bitcast i16 %src0.f16 to half
+ %src1.f16 = trunc i32 %src1.arg to i16
+ %src1 = bitcast i16 %src1.f16 to half
+ %src2.f16 = trunc i32 %src2.arg to i16
+ %src2 = bitcast i16 %src2.f16 to half
+ %src0.fneg = fsub half -0.0, %src0
+ %src1.fabs = call half @llvm.fabs.f16(half %src1)
+ %src2.fabs = call half @llvm.fabs.f16(half %src2)
+ %src2.fneg.fabs = fsub half -0.0, %src2.fabs
+ %mad = call half @llvm.amdgcn.fmed3.f16(half %src0.fneg, half %src1.fabs, half %src2.fneg.fabs)
+ store half %mad, half addrspace(1)* %out
+ ret void
+}
+
+declare half @llvm.amdgcn.fmed3.f16(half, half, half) #0
+declare half @llvm.fabs.f16(half) #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll
new file mode 100644
index 000000000000..230e625ad45b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll
@@ -0,0 +1,28 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_fmed3:
+; GCN: v_med3_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @test_fmed3(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 {
+ %mad = call float @llvm.amdgcn.fmed3.f32(float %src0, float %src1, float %src2)
+ store float %mad, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fmed3_srcmods:
+; GCN: v_med3_f32 v{{[0-9]+}}, -s{{[0-9]+}}, |v{{[0-9]+}}|, -|v{{[0-9]+}}|
+define amdgpu_kernel void @test_fmed3_srcmods(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 {
+ %src0.fneg = fsub float -0.0, %src0
+ %src1.fabs = call float @llvm.fabs.f32(float %src1)
+ %src2.fabs = call float @llvm.fabs.f32(float %src2)
+ %src2.fneg.fabs = fsub float -0.0, %src2.fabs
+ %mad = call float @llvm.amdgcn.fmed3.f32(float %src0.fneg, float %src1.fabs, float %src2.fneg.fabs)
+ store float %mad, float addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.amdgcn.fmed3.f32(float, float, float) #0
+declare float @llvm.fabs.f32(float) #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
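
v_med3 selects the median of its three operands, so an equivalent plain-IR expansion is the classic min/max network below — a sketch for comparison only (@fmed3_reference is illustrative and ignores v_med3's precise NaN/denormal behavior):

declare float @llvm.minnum.f32(float, float)
declare float @llvm.maxnum.f32(float, float)

; median(a, b, c) = max(min(a, b), min(max(a, b), c))
define float @fmed3_reference(float %a, float %b, float %c) {
  %min.ab = call float @llvm.minnum.f32(float %a, float %b)
  %max.ab = call float @llvm.maxnum.f32(float %a, float %b)
  %mid = call float @llvm.minnum.f32(float %max.ab, float %c)
  %med = call float @llvm.maxnum.f32(float %min.ab, float %mid)
  ret float %med
}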
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
index d5c1c0a0969b..b47d2dbc744d 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
@@ -4,7 +4,7 @@
; GCN-LABEL: {{^}}test_mul_legacy_f32:
; GCN: v_mul_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-define void @test_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b) #0 {
%result = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
store float %result, float addrspace(1)* %out, align 4
ret void
@@ -12,7 +12,7 @@ define void @test_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b) #
; GCN-LABEL: {{^}}test_mul_legacy_undef0_f32:
; GCN: v_mul_legacy_f32_e32
-define void @test_mul_legacy_undef0_f32(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_mul_legacy_undef0_f32(float addrspace(1)* %out, float %a) #0 {
%result = call float @llvm.amdgcn.fmul.legacy(float undef, float %a)
store float %result, float addrspace(1)* %out, align 4
ret void
@@ -20,7 +20,7 @@ define void @test_mul_legacy_undef0_f32(float addrspace(1)* %out, float %a) #0 {
; GCN-LABEL: {{^}}test_mul_legacy_undef1_f32:
; GCN: v_mul_legacy_f32_e32
-define void @test_mul_legacy_undef1_f32(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_mul_legacy_undef1_f32(float addrspace(1)* %out, float %a) #0 {
%result = call float @llvm.amdgcn.fmul.legacy(float %a, float undef)
store float %result, float addrspace(1)* %out, align 4
ret void
@@ -28,7 +28,7 @@ define void @test_mul_legacy_undef1_f32(float addrspace(1)* %out, float %a) #0 {
; GCN-LABEL: {{^}}test_mul_legacy_fabs_f32:
; GCN: v_mul_legacy_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |s{{[0-9]+}}|
-define void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float %b) #0 {
%a.fabs = call float @llvm.fabs.f32(float %a)
%b.fabs = call float @llvm.fabs.f32(float %b)
%result = call float @llvm.amdgcn.fmul.legacy(float %a.fabs, float %b.fabs)
@@ -40,7 +40,7 @@ define void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float
; GCN-LABEL: {{^}}test_mad_legacy_f32:
; GCN: v_mul_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_add_f32_e32
-define void @test_mad_legacy_f32(float addrspace(1)* %out, float %a, float %b, float %c) #0 {
+define amdgpu_kernel void @test_mad_legacy_f32(float addrspace(1)* %out, float %a, float %b, float %c) #0 {
%mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
%add = fadd float %mul, %c
store float %add, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll
index d8c1af036a34..026f6901fc7f 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll
@@ -7,7 +7,7 @@ declare half @llvm.amdgcn.fract.f16(half %a)
; VI: v_fract_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @fract_f16(
+define amdgpu_kernel void @fract_f16(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
index a75267b8d693..d4f1c5fd9be7 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
@@ -6,7 +6,7 @@ declare double @llvm.amdgcn.fract.f64(double) #0
; GCN-LABEL: {{^}}v_fract_f32:
; GCN: v_fract_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @v_fract_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @v_fract_f32(float addrspace(1)* %out, float %src) #1 {
%fract = call float @llvm.amdgcn.fract.f32(float %src)
store float %fract, float addrspace(1)* %out
ret void
@@ -14,7 +14,7 @@ define void @v_fract_f32(float addrspace(1)* %out, float %src) #1 {
; GCN-LABEL: {{^}}v_fract_f64:
; GCN: v_fract_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @v_fract_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @v_fract_f64(double addrspace(1)* %out, double %src) #1 {
%fract = call double @llvm.amdgcn.fract.f64(double %src)
store double %fract, double addrspace(1)* %out
ret void
@@ -22,9 +22,8 @@ define void @v_fract_f64(double addrspace(1)* %out, double %src) #1 {
; GCN-LABEL: {{^}}v_fract_undef_f32:
; GCN-NOT: v_fract_f32
-; GCN-NOT: v0
-; GCN: buffer_store_dword v0
-define void @v_fract_undef_f32(float addrspace(1)* %out) #1 {
+; GCN-NOT: store_dword
+define amdgpu_kernel void @v_fract_undef_f32(float addrspace(1)* %out) #1 {
%fract = call float @llvm.amdgcn.fract.f32(float undef)
store float %fract, float addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll
index 7521224058f3..dc3eb4ce191e 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll
@@ -6,7 +6,7 @@ declare i16 @llvm.amdgcn.frexp.exp.i16.f16(half %a)
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; VI: v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_short v[[R_I16]]
-define void @frexp_exp_f16(
+define amdgpu_kernel void @frexp_exp_f16(
i16 addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -21,7 +21,7 @@ entry:
; VI: v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]]
; VI: v_bfe_i32 v[[R_I32:[0-9]+]], v[[R_I16]], 0, 16{{$}}
; GCN: buffer_store_dword v[[R_I32]]
-define void @frexp_exp_f16_sext(
+define amdgpu_kernel void @frexp_exp_f16_sext(
i32 addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -37,7 +37,7 @@ entry:
; VI: v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]]
; VI: v_and_b32_e32 v[[R_I32:[0-9]+]], 0xffff, v[[R_I16]]
; GCN: buffer_store_dword v[[R_I32]]
-define void @frexp_exp_f16_zext(
+define amdgpu_kernel void @frexp_exp_f16_zext(
i32 addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
index 9c49f175f2b5..0d686147caf8 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
@@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.frexp.exp.i32.f64(double) #0
; GCN-LABEL: {{^}}s_test_frexp_exp_f32:
; GCN: v_frexp_exp_i32_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @s_test_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
%frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %src)
store i32 %frexp.exp, i32 addrspace(1)* %out
ret void
@@ -16,7 +16,7 @@ define void @s_test_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
; GCN-LABEL: {{^}}s_test_fabs_frexp_exp_f32:
; GCN: v_frexp_exp_i32_f32_e64 {{v[0-9]+}}, |{{s[0-9]+}}|
-define void @s_test_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
%fabs.src = call float @llvm.fabs.f32(float %src)
%frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %fabs.src)
store i32 %frexp.exp, i32 addrspace(1)* %out
@@ -25,7 +25,7 @@ define void @s_test_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_exp_f32:
; GCN: v_frexp_exp_i32_f32_e64 {{v[0-9]+}}, -|{{s[0-9]+}}|
-define void @s_test_fneg_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_fneg_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
%fabs.src = call float @llvm.fabs.f32(float %src)
%fneg.fabs.src = fsub float -0.0, %fabs.src
%frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %fneg.fabs.src)
@@ -35,7 +35,7 @@ define void @s_test_fneg_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src)
; GCN-LABEL: {{^}}s_test_frexp_exp_f64:
; GCN: v_frexp_exp_i32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @s_test_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
%frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f64(double %src)
store i32 %frexp.exp, i32 addrspace(1)* %out
ret void
@@ -43,7 +43,7 @@ define void @s_test_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
; GCN-LABEL: {{^}}s_test_fabs_frexp_exp_f64:
; GCN: v_frexp_exp_i32_f64_e64 {{v[0-9]+}}, |{{s\[[0-9]+:[0-9]+\]}}|
-define void @s_test_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
%fabs.src = call double @llvm.fabs.f64(double %src)
%frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f64(double %fabs.src)
store i32 %frexp.exp, i32 addrspace(1)* %out
@@ -52,7 +52,7 @@ define void @s_test_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_exp_f64:
; GCN: v_frexp_exp_i32_f64_e64 {{v[0-9]+}}, -|{{s\[[0-9]+:[0-9]+\]}}|
-define void @s_test_fneg_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_fneg_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
%fabs.src = call double @llvm.fabs.f64(double %src)
%fneg.fabs.src = fsub double -0.0, %fabs.src
%frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f64(double %fneg.fabs.src)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll
index 706537d7e21c..722cd44e99fb 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll
@@ -7,7 +7,7 @@ declare half @llvm.amdgcn.frexp.mant.f16(half %a)
; VI: v_frexp_mant_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @frexp_mant_f16(
+define amdgpu_kernel void @frexp_mant_f16(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll
index b8d63defffed..605dc3db2b98 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll
@@ -8,7 +8,7 @@ declare double @llvm.amdgcn.frexp.mant.f64(double) #0
; GCN-LABEL: {{^}}s_test_frexp_mant_f32:
; GCN: v_frexp_mant_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @s_test_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
%frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %src)
store float %frexp.mant, float addrspace(1)* %out
ret void
@@ -16,7 +16,7 @@ define void @s_test_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
; GCN-LABEL: {{^}}s_test_fabs_frexp_mant_f32:
; GCN: v_frexp_mant_f32_e64 {{v[0-9]+}}, |{{s[0-9]+}}|
-define void @s_test_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
%fabs.src = call float @llvm.fabs.f32(float %src)
%frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %fabs.src)
store float %frexp.mant, float addrspace(1)* %out
@@ -25,7 +25,7 @@ define void @s_test_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1
; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_mant_f32:
; GCN: v_frexp_mant_f32_e64 {{v[0-9]+}}, -|{{s[0-9]+}}|
-define void @s_test_fneg_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_fneg_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
%fabs.src = call float @llvm.fabs.f32(float %src)
%fneg.fabs.src = fsub float -0.0, %fabs.src
%frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %fneg.fabs.src)
@@ -35,7 +35,7 @@ define void @s_test_fneg_fabs_frexp_mant_f32(float addrspace(1)* %out, float %sr
; GCN-LABEL: {{^}}s_test_frexp_mant_f64:
; GCN: v_frexp_mant_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @s_test_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
%frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %src)
store double %frexp.mant, double addrspace(1)* %out
ret void
@@ -43,7 +43,7 @@ define void @s_test_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
; GCN-LABEL: {{^}}s_test_fabs_frexp_mant_f64:
; GCN: v_frexp_mant_f64_e64 {{v\[[0-9]+:[0-9]+\]}}, |{{s\[[0-9]+:[0-9]+\]}}|
-define void @s_test_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
%fabs.src = call double @llvm.fabs.f64(double %src)
%frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %fabs.src)
store double %frexp.mant, double addrspace(1)* %out
@@ -52,7 +52,7 @@ define void @s_test_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src)
; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_mant_f64:
; GCN: v_frexp_mant_f64_e64 {{v\[[0-9]+:[0-9]+\]}}, -|{{s\[[0-9]+:[0-9]+\]}}|
-define void @s_test_fneg_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_fneg_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
%fabs.src = call double @llvm.fabs.f64(double %src)
%fneg.fabs.src = fsub double -0.0, %fabs.src
%frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %fneg.fabs.src)
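
Taken together, frexp.mant and frexp.exp split a value in the usual C frexp sense: for finite, non-zero src, src = mant * 2^exp with |mant| in [0.5, 1.0). A round-trip sketch (assuming the llvm.amdgcn.ldexp.f32 intrinsic is available; inf/NaN edge cases are not modeled, and @frexp_roundtrip is illustrative only):

declare float @llvm.amdgcn.frexp.mant.f32(float)
declare i32 @llvm.amdgcn.frexp.exp.i32.f32(float)
declare float @llvm.amdgcn.ldexp.f32(float, i32)

; Recombines the mantissa and exponent; returns %src exactly for
; finite, non-zero inputs.
define float @frexp_roundtrip(float %src) {
  %mant = call float @llvm.amdgcn.frexp.mant.f32(float %src)
  %exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %src)
  %re = call float @llvm.amdgcn.ldexp.f32(float %mant, i32 %exp)
  ret float %re
}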
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
index 6014e2ed85f8..d26fab4cebe1 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
@@ -9,7 +9,7 @@
; CHECK-LABEL: {{^}}groupstaticsize_test0:
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}}
-define void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 {
+define amdgpu_kernel void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 64
%static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
@@ -23,7 +23,7 @@ define void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %
; CHECK-LABEL: {{^}}groupstaticsize_test1:
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}}
-define void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) {
+define amdgpu_kernel void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) {
entry:
%static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4
@@ -51,7 +51,7 @@ endif: ; preds = %else, %if
; Exceeds 16-bit simm limit of s_movk_i32
; CHECK-LABEL: {{^}}large_groupstaticsize:
; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
-define void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 {
+define amdgpu_kernel void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 {
%gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(3)* @large, i32 0, i32 %idx
store volatile i32 0, i32 addrspace(3)* %gep
%static_lds_size = call i32 @llvm.amdgcn.groupstaticsize()
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll
index 6d0457bc6489..aa04af7a64a9 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll
@@ -4,9 +4,18 @@
declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #0
declare i64 @llvm.amdgcn.icmp.i64(i64, i64, i32) #0
+; No crash on a non-constant (dynamic) condition code
+; GCN-LABEL: {{^}}v_icmp_i32_dynamic_cc:
+; GCN: s_endpgm
+define amdgpu_kernel void @v_icmp_i32_dynamic_cc(i64 addrspace(1)* %out, i32 %src, i32 %cc) {
+ %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 %cc)
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
; GCN-LABEL: {{^}}v_icmp_i32_eq:
; GCN: v_cmp_eq_u32_e64
-define void @v_icmp_i32_eq(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_eq(i64 addrspace(1)* %out, i32 %src) {
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 32)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -14,14 +23,14 @@ define void @v_icmp_i32_eq(i64 addrspace(1)* %out, i32 %src) {
; GCN-LABEL: {{^}}v_icmp:
; GCN-NOT: v_cmp_eq_u32_e64
-define void @v_icmp(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp(i64 addrspace(1)* %out, i32 %src) {
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 30)
store i64 %result, i64 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}v_icmp_i32_ne:
; GCN: v_cmp_ne_u32_e64
-define void @v_icmp_i32_ne(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_ne(i64 addrspace(1)* %out, i32 %src) {
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 33)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -29,7 +38,7 @@ define void @v_icmp_i32_ne(i64 addrspace(1)* %out, i32 %src) {
; GCN-LABEL: {{^}}v_icmp_u32_ugt:
; GCN: v_cmp_gt_u32_e64
-define void @v_icmp_u32_ugt(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_u32_ugt(i64 addrspace(1)* %out, i32 %src) {
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 34)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -37,7 +46,7 @@ define void @v_icmp_u32_ugt(i64 addrspace(1)* %out, i32 %src) {
; GCN-LABEL: {{^}}v_icmp_u32_uge:
; GCN: v_cmp_ge_u32_e64
-define void @v_icmp_u32_uge(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_u32_uge(i64 addrspace(1)* %out, i32 %src) {
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 35)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -45,7 +54,7 @@ define void @v_icmp_u32_uge(i64 addrspace(1)* %out, i32 %src) {
; GCN-LABEL: {{^}}v_icmp_u32_ult:
; GCN: v_cmp_lt_u32_e64
-define void @v_icmp_u32_ult(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_u32_ult(i64 addrspace(1)* %out, i32 %src) {
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 36)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -53,7 +62,7 @@ define void @v_icmp_u32_ult(i64 addrspace(1)* %out, i32 %src) {
; GCN-LABEL: {{^}}v_icmp_u32_ule:
; GCN: v_cmp_le_u32_e64
-define void @v_icmp_u32_ule(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_u32_ule(i64 addrspace(1)* %out, i32 %src) {
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 37)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -61,7 +70,7 @@ define void @v_icmp_u32_ule(i64 addrspace(1)* %out, i32 %src) {
; GCN-LABEL: {{^}}v_icmp_i32_sgt:
; GCN: v_cmp_gt_i32_e64
-define void @v_icmp_i32_sgt(i64 addrspace(1)* %out, i32 %src) #1 {
+define amdgpu_kernel void @v_icmp_i32_sgt(i64 addrspace(1)* %out, i32 %src) #1 {
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 38)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -69,7 +78,7 @@ define void @v_icmp_i32_sgt(i64 addrspace(1)* %out, i32 %src) #1 {
; GCN-LABEL: {{^}}v_icmp_i32_sge:
; GCN: v_cmp_ge_i32_e64
-define void @v_icmp_i32_sge(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_sge(i64 addrspace(1)* %out, i32 %src) {
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 39)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -77,14 +86,14 @@ define void @v_icmp_i32_sge(i64 addrspace(1)* %out, i32 %src) {
; GCN-LABEL: {{^}}v_icmp_i32_slt:
; GCN: v_cmp_lt_i32_e64
-define void @v_icmp_i32_slt(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_slt(i64 addrspace(1)* %out, i32 %src) {
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 40)
store i64 %result, i64 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}v_icmp_i32_sle:
; GCN: v_cmp_le_i32_e64
-define void @v_icmp_i32_sle(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_sle(i64 addrspace(1)* %out, i32 %src) {
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 41)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -92,7 +101,7 @@ define void @v_icmp_i32_sle(i64 addrspace(1)* %out, i32 %src) {
; GCN-LABEL: {{^}}v_icmp_i64_eq:
; GCN: v_cmp_eq_u64_e64
-define void @v_icmp_i64_eq(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_eq(i64 addrspace(1)* %out, i64 %src) {
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 32)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -100,7 +109,7 @@ define void @v_icmp_i64_eq(i64 addrspace(1)* %out, i64 %src) {
; GCN-LABEL: {{^}}v_icmp_i64_ne:
; GCN: v_cmp_ne_u64_e64
-define void @v_icmp_i64_ne(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_ne(i64 addrspace(1)* %out, i64 %src) {
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 33)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -108,7 +117,7 @@ define void @v_icmp_i64_ne(i64 addrspace(1)* %out, i64 %src) {
; GCN-LABEL: {{^}}v_icmp_u64_ugt:
; GCN: v_cmp_gt_u64_e64
-define void @v_icmp_u64_ugt(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_u64_ugt(i64 addrspace(1)* %out, i64 %src) {
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 34)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -116,7 +125,7 @@ define void @v_icmp_u64_ugt(i64 addrspace(1)* %out, i64 %src) {
; GCN-LABEL: {{^}}v_icmp_u64_uge:
; GCN: v_cmp_ge_u64_e64
-define void @v_icmp_u64_uge(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_u64_uge(i64 addrspace(1)* %out, i64 %src) {
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 35)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -124,7 +133,7 @@ define void @v_icmp_u64_uge(i64 addrspace(1)* %out, i64 %src) {
; GCN-LABEL: {{^}}v_icmp_u64_ult:
; GCN: v_cmp_lt_u64_e64
-define void @v_icmp_u64_ult(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_u64_ult(i64 addrspace(1)* %out, i64 %src) {
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 36)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -132,7 +141,7 @@ define void @v_icmp_u64_ult(i64 addrspace(1)* %out, i64 %src) {
; GCN-LABEL: {{^}}v_icmp_u64_ule:
; GCN: v_cmp_le_u64_e64
-define void @v_icmp_u64_ule(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_u64_ule(i64 addrspace(1)* %out, i64 %src) {
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 37)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -140,7 +149,7 @@ define void @v_icmp_u64_ule(i64 addrspace(1)* %out, i64 %src) {
; GCN-LABEL: {{^}}v_icmp_i64_sgt:
; GCN: v_cmp_gt_i64_e64
-define void @v_icmp_i64_sgt(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_sgt(i64 addrspace(1)* %out, i64 %src) {
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 38)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -148,7 +157,7 @@ define void @v_icmp_i64_sgt(i64 addrspace(1)* %out, i64 %src) {
; GCN-LABEL: {{^}}v_icmp_i64_sge:
; GCN: v_cmp_ge_i64_e64
-define void @v_icmp_i64_sge(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_sge(i64 addrspace(1)* %out, i64 %src) {
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 39)
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -156,14 +165,14 @@ define void @v_icmp_i64_sge(i64 addrspace(1)* %out, i64 %src) {
; GCN-LABEL: {{^}}v_icmp_i64_slt:
; GCN: v_cmp_lt_i64_e64
-define void @v_icmp_i64_slt(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_slt(i64 addrspace(1)* %out, i64 %src) {
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 40)
store i64 %result, i64 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}v_icmp_i64_sle:
; GCN: v_cmp_le_i64_e64
-define void @v_icmp_i64_sle(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_sle(i64 addrspace(1)* %out, i64 %src) {
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 41)
store i64 %result, i64 addrspace(1)* %out
ret void
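
As with the fcmp variant, the cc operand follows CmpInst::Predicate, so the integer predicates occupy 32-41; the value 30 used in @v_icmp falls outside that range and must fold away instead of selecting a compare. For reference:

; icmp cc operand values used in this file (llvm::CmpInst::Predicate):
;  32 = ICMP_EQ   33 = ICMP_NE   34 = ICMP_UGT  35 = ICMP_UGE  36 = ICMP_ULT
;  37 = ICMP_ULE  38 = ICMP_SGT  39 = ICMP_SGE  40 = ICMP_SLT  41 = ICMP_SLE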
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll
index a65f422742c9..a9351dbb27d2 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll
@@ -3,7 +3,7 @@
; GCN-LABEL: {{^}}gather4_v2:
; GCN: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_v2(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_v2(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -12,7 +12,7 @@ main_body:
; GCN-LABEL: {{^}}gather4:
; GCN: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_cl:
; GCN: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_cl(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -30,7 +30,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_l:
; GCN: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_l(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_l(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -39,7 +39,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_b:
; GCN: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_b(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_b(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -48,7 +48,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_b_cl:
; GCN: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_b_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_b_cl(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -57,7 +57,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_b_cl_v8:
; GCN: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_b_cl_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_b_cl_v8(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -66,7 +66,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_lz_v2:
; GCN: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_lz_v2(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_lz_v2(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.lz.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -75,7 +75,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_lz:
; GCN: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_lz(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_lz(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -86,7 +86,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_o:
; GCN: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_o(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -95,7 +95,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_cl_o:
; GCN: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_cl_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_cl_o(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -104,7 +104,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_cl_o_v8:
; GCN: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_cl_o_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_cl_o_v8(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -113,7 +113,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_l_o:
; GCN: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_l_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_l_o(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -122,7 +122,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_l_o_v8:
; GCN: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_l_o_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_l_o_v8(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.l.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -131,7 +131,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_b_o:
; GCN: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_b_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_b_o(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -140,7 +140,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_b_o_v8:
; GCN: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_b_o_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_b_o_v8(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.b.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -149,7 +149,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_b_cl_o:
; GCN: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_b_cl_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_b_cl_o(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -158,7 +158,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_lz_o:
; GCN: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_lz_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_lz_o(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -168,7 +168,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_c:
; GCN: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.c.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -177,7 +177,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_c_cl:
; GCN: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_cl(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -186,7 +186,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_c_cl_v8:
; GCN: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_cl_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_cl_v8(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -195,7 +195,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_c_l:
; GCN: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_l(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_l(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.c.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -204,7 +204,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_c_l_v8:
; GCN: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_l_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_l_v8(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.c.l.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -213,7 +213,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_c_b:
; GCN: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_b(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_b(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -222,7 +222,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_c_b_v8:
; GCN: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_b_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_b_v8(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -231,7 +231,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_c_b_cl:
; GCN: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_b_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_b_cl(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -240,7 +240,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_c_lz:
; GCN: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_lz(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_lz(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -250,7 +250,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_c_o:
; GCN: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_o(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.c.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -259,7 +259,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_c_o_v8:
; GCN: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_o_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_o_v8(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.c.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -268,7 +268,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_c_cl_o:
; GCN: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_cl_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_cl_o(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -277,7 +277,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_c_l_o:
; GCN: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_l_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_l_o(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -286,7 +286,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_c_b_o:
; GCN: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_b_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_b_o(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -295,7 +295,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_c_b_cl_o:
; GCN: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_b_cl_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_b_cl_o(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -304,7 +304,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_c_lz_o:
; GCN: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_lz_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_lz_o(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -313,7 +313,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_c_lz_o_v8:
; GCN: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_lz_o_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_lz_o_v8(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -322,7 +322,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_f32:
; GCN: image_gather4 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_f32(float addrspace(1)* %out) {
main_body:
%r = call float @llvm.amdgcn.image.gather4.f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
store float %r, float addrspace(1)* %out
@@ -331,7 +331,7 @@ main_body:
; GCN-LABEL: {{^}}gather4_v2f32:
; GCN: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da
-define void @gather4_v2f32(<2 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_v2f32(<2 x float> addrspace(1)* %out) {
main_body:
%r = call <2 x float> @llvm.amdgcn.image.gather4.v2f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 0, i1 0, i1 0, i1 0, i1 1)
store <2 x float> %r, <2 x float> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll
index ef810a330017..2e78e2a4c6f5 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll
@@ -3,7 +3,7 @@
; GCN-LABEL: {{^}}getlod:
; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da
-define void @getlod(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @getlod(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.f32.v8i32(float undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -12,7 +12,7 @@ main_body:
; GCN-LABEL: {{^}}getlod_v2:
; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da
-define void @getlod_v2(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @getlod_v2(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -21,13 +21,23 @@ main_body:
; GCN-LABEL: {{^}}getlod_v4:
; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da
-define void @getlod_v4(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @getlod_v4(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1)
store <4 x float> %r, <4 x float> addrspace(1)* %out
ret void
}
+; GCN-LABEL: {{^}}adjust_writemask_getlod_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_getlod_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
declare <4 x float> @llvm.amdgcn.image.getlod.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
declare <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
index 69c43ca3070a..c74c0fa15855 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
@@ -1,146 +1,144 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=CHECK,VI %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s

-;CHECK-LABEL: {{^}}image_load_v4i32:
-;CHECK: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm
-;CHECK: s_waitcnt vmcnt(0)
-define amdgpu_ps <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+; GCN-LABEL: {{^}}image_load_v4i32:
+; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm
+; GCN: s_waitcnt vmcnt(0)
+define amdgpu_ps <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
main_body:
- %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
ret <4 x float> %tex
}
-;CHECK-LABEL: {{^}}image_load_v2i32:
-;CHECK: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm
-;CHECK: s_waitcnt vmcnt(0)
-define amdgpu_ps <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) {
+; GCN-LABEL: {{^}}image_load_v2i32:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm
+; GCN: s_waitcnt vmcnt(0)
+define amdgpu_ps <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) #0 {
main_body:
- %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
ret <4 x float> %tex
}
-;CHECK-LABEL: {{^}}image_load_i32:
-;CHECK: image_load v[0:3], v0, s[0:7] dmask:0xf unorm
-;CHECK: s_waitcnt vmcnt(0)
-define amdgpu_ps <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) {
+; GCN-LABEL: {{^}}image_load_i32:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm
+; GCN: s_waitcnt vmcnt(0)
+define amdgpu_ps <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) #0 {
main_body:
- %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
ret <4 x float> %tex
}
-;CHECK-LABEL: {{^}}image_load_mip:
-;CHECK: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm
-;CHECK: s_waitcnt vmcnt(0)
-define amdgpu_ps <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+; GCN-LABEL: {{^}}image_load_mip:
+; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm
+; GCN: s_waitcnt vmcnt(0)
+define amdgpu_ps <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
main_body:
- %tex = call <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ %tex = call <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
ret <4 x float> %tex
}
-;CHECK-LABEL: {{^}}image_load_1:
-;CHECK: image_load v0, v[0:3], s[0:7] dmask:0x1 unorm
-;CHECK: s_waitcnt vmcnt(0)
-define amdgpu_ps float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+; GCN-LABEL: {{^}}image_load_1:
+; GCN: image_load v0, v[0:3], s[0:7] dmask:0x1 unorm
+; GCN: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
main_body:
- %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
%elt = extractelement <4 x float> %tex, i32 0
-; Only first component used, test that dmask etc. is changed accordingly
ret float %elt
}
-;CHECK-LABEL: {{^}}image_load_f32_v2i32:
-;CHECK: image_load {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm
-;CHECK: s_waitcnt vmcnt(0)
-define amdgpu_ps float @image_load_f32_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) {
+; GCN-LABEL: {{^}}image_load_f32_v2i32:
+; GCN: image_load {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm
+; GCN: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_load_f32_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) #0 {
main_body:
- %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 1, i1 0, i1 0, i1 0, i1 0)
+ %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 1, i1 false, i1 false, i1 false, i1 false)
ret float %tex
}
-;CHECK-LABEL: {{^}}image_load_v2f32_v4i32:
-;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm
-;CHECK: s_waitcnt vmcnt(0)
-define amdgpu_ps <2 x float> @image_load_v2f32_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+; GCN-LABEL: {{^}}image_load_v2f32_v4i32:
+; GCN: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm
+; GCN: s_waitcnt vmcnt(0)
+define amdgpu_ps <2 x float> @image_load_v2f32_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
main_body:
- %tex = call <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 3, i1 0, i1 0, i1 0, i1 0)
+ %tex = call <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 3, i1 false, i1 false, i1 false, i1 false)
ret <2 x float> %tex
}
-
-;CHECK-LABEL: {{^}}image_store_v4i32:
-;CHECK: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm
-define amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) {
+; GCN-LABEL: {{^}}image_store_v4i32:
+; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm
+define amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 {
main_body:
- call void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ call void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
ret void
}
-;CHECK-LABEL: {{^}}image_store_v2i32:
-;CHECK: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm
-define amdgpu_ps void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) {
+; GCN-LABEL: {{^}}image_store_v2i32:
+; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm
+define amdgpu_ps void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) #0 {
main_body:
- call void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float> %data, <2 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ call void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float> %data, <2 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
ret void
}
-;CHECK-LABEL: {{^}}image_store_i32:
-;CHECK: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
-define amdgpu_ps void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) {
+; GCN-LABEL: {{^}}image_store_i32:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
+define amdgpu_ps void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) #0 {
main_body:
- call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
ret void
}
-;CHECK-LABEL: {{^}}image_store_f32_i32:
-;CHECK: image_store {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm
-define amdgpu_ps void @image_store_f32_i32(<8 x i32> inreg %rsrc, float %data, i32 %coords) {
+; GCN-LABEL: {{^}}image_store_f32_i32:
+; GCN: image_store {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm
+define amdgpu_ps void @image_store_f32_i32(<8 x i32> inreg %rsrc, float %data, i32 %coords) #0 {
main_body:
- call void @llvm.amdgcn.image.store.f32.i32.v8i32(float %data, i32 %coords, <8 x i32> %rsrc, i32 1, i1 0, i1 0, i1 0, i1 0)
+ call void @llvm.amdgcn.image.store.f32.i32.v8i32(float %data, i32 %coords, <8 x i32> %rsrc, i32 1, i1 false, i1 false, i1 false, i1 false)
ret void
}
-;CHECK-LABEL: {{^}}image_store_v2f32_v4i32:
-;CHECK: image_store {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm
-define amdgpu_ps void @image_store_v2f32_v4i32(<8 x i32> inreg %rsrc, <2 x float> %data, <4 x i32> %coords) {
+; GCN-LABEL: {{^}}image_store_v2f32_v4i32:
+; GCN: image_store {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm
+define amdgpu_ps void @image_store_v2f32_v4i32(<8 x i32> inreg %rsrc, <2 x float> %data, <4 x i32> %coords) #0 {
main_body:
- call void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 3, i1 0, i1 0, i1 0, i1 0)
+ call void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 3, i1 false, i1 false, i1 false, i1 false)
ret void
}
-;CHECK-LABEL: {{^}}image_store_mip:
-;CHECK: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm
-define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) {
+; GCN-LABEL: {{^}}image_store_mip:
+; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm
+define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 {
main_body:
- call void @llvm.amdgcn.image.store.mip.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ call void @llvm.amdgcn.image.store.mip.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
ret void
}
-;CHECK-LABEL: {{^}}getresinfo:
-;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @getresinfo() {
+; GCN-LABEL: {{^}}getresinfo:
+; GCN: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @getresinfo() #0 {
main_body:
- %r = call <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32 undef, <8 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0)
+ %r = call <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32 undef, <8 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
%r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r0, float %r1, float %r2, float %r3, i1 true, i1 true) #0
ret void
}
; Ideally, the register allocator would avoid the wait here
;
-;CHECK-LABEL: {{^}}image_store_wait:
-;CHECK: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
-;CHECK: s_waitcnt vmcnt(0) expcnt(0)
-;CHECK: image_load v[0:3], v4, s[8:15] dmask:0xf unorm
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: image_store v[0:3], v4, s[16:23] dmask:0xf unorm
-define amdgpu_ps void @image_store_wait(<8 x i32> inreg, <8 x i32> inreg, <8 x i32> inreg, <4 x float>, i32) {
+; GCN-LABEL: {{^}}image_store_wait:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
+; GCN: s_waitcnt vmcnt(0) expcnt(0)
+; GCN: image_load v[0:3], v4, s[8:15] dmask:0xf unorm
+; GCN: s_waitcnt vmcnt(0)
+; GCN: image_store v[0:3], v4, s[16:23] dmask:0xf unorm
+define amdgpu_ps void @image_store_wait(<8 x i32> inreg %arg, <8 x i32> inreg %arg1, <8 x i32> inreg %arg2, <4 x float> %arg3, i32 %arg4) #0 {
main_body:
- call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %3, i32 %4, <8 x i32> %0, i32 15, i1 0, i1 0, i1 0, i1 0)
- %data = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %4, <8 x i32> %1, i32 15, i1 0, i1 0, i1 0, i1 0)
- call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %4, <8 x i32> %2, i32 15, i1 0, i1 0, i1 0, i1 0)
+ call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %arg3, i32 %arg4, <8 x i32> %arg, i32 15, i1 false, i1 false, i1 false, i1 false)
+ %data = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %arg4, <8 x i32> %arg1, i32 15, i1 false, i1 false, i1 false, i1 false)
+ call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %arg4, <8 x i32> %arg2, i32 15, i1 false, i1 false, i1 false, i1 false)
ret void
}
@@ -149,21 +147,22 @@ main_body:
; VI-LABEL: image_load_mmo
; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
-define amdgpu_ps void @image_load_mmo(float addrspace(3)* %lds, <2 x i32> %c, <8 x i32> inreg %rsrc) {
- store float 0.0, float addrspace(3)* %lds
- %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+define amdgpu_ps void @image_load_mmo(float addrspace(3)* %lds, <2 x i32> %c, <8 x i32> inreg %rsrc) #0 {
+bb:
+ store float 0.000000e+00, float addrspace(3)* %lds
+ %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
%tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
- store float 0.0, float addrspace(3)* %tmp2
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tex, float %tex, float %tex, float %tex)
+ store float 0.000000e+00, float addrspace(3)* %tmp2
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex, float %tex, float %tex, float %tex, i1 true, i1 true) #0
ret void
}
declare float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
declare <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
declare void @llvm.amdgcn.image.store.f32.i32.v8i32(float, i32, <8 x i32>, i32, i1, i1, i1, i1) #0
declare void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
declare void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0
declare void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float>, <2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
@@ -173,10 +172,9 @@ declare <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32, <8 x i32>, i32,
declare <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
declare <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1

-declare <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #0
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll
index 752ec2d42fac..4f90b0a25eaa 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll
@@ -3,7 +3,7 @@
; GCN-LABEL: {{^}}sample:
; GCN: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -12,7 +12,7 @@ main_body:
; GCN-LABEL: {{^}}sample_cl:
; GCN: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_cl(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@ main_body:
; GCN-LABEL: {{^}}sample_d:
; GCN: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_d(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_d(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -30,7 +30,7 @@ main_body:
; GCN-LABEL: {{^}}sample_d_cl:
; GCN: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_d_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_d_cl(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -39,7 +39,7 @@ main_body:
; GCN-LABEL: {{^}}sample_l:
; GCN: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_l(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_l(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -48,7 +48,7 @@ main_body:
; GCN-LABEL: {{^}}sample_b:
; GCN: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_b(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_b(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -57,7 +57,7 @@ main_body:
; GCN-LABEL: {{^}}sample_b_cl:
; GCN: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_b_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_b_cl(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -66,7 +66,7 @@ main_body:
; GCN-LABEL: {{^}}sample_lz:
; GCN: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_lz(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_lz(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -75,7 +75,7 @@ main_body:
; GCN-LABEL: {{^}}sample_cd:
; GCN: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_cd(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_cd(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.cd.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -84,7 +84,7 @@ main_body:
; GCN-LABEL: {{^}}sample_cd_cl:
; GCN: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_cd_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_cd_cl(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -93,7 +93,7 @@ main_body:
; GCN-LABEL: {{^}}sample_c:
; GCN: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -102,7 +102,7 @@ main_body:
; GCN-LABEL: {{^}}sample_c_cl:
; GCN: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_cl(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.c.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -111,7 +111,7 @@ main_body:
; GCN-LABEL: {{^}}sample_c_d:
; GCN: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_d(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_d(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.c.d.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -120,7 +120,7 @@ main_body:
; GCN-LABEL: {{^}}sample_c_d_cl:
; GCN: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_d_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_d_cl(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -129,7 +129,7 @@ main_body:
; GCN-LABEL: {{^}}sample_c_l:
; GCN: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_l(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_l(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.c.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -138,7 +138,7 @@ main_body:
; GCN-LABEL: {{^}}sample_c_b:
; GCN: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_b(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_b(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.c.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -147,7 +147,7 @@ main_body:
; GCN-LABEL: {{^}}sample_c_b_cl:
; GCN: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_b_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_b_cl(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -156,7 +156,7 @@ main_body:
; GCN-LABEL: {{^}}sample_c_lz:
; GCN: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_lz(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_lz(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.c.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -165,7 +165,7 @@ main_body:
; GCN-LABEL: {{^}}sample_c_cd:
; GCN: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_cd(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_cd(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -174,7 +174,7 @@ main_body:
; GCN-LABEL: {{^}}sample_c_cd_cl:
; GCN: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -183,7 +183,7 @@ main_body:
; GCN-LABEL: {{^}}sample_f32:
; GCN: image_sample {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1
-define void @sample_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @sample_f32(float addrspace(1)* %out) {
main_body:
%r = call float @llvm.amdgcn.image.sample.f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 0)
store float %r, float addrspace(1)* %out
@@ -192,13 +192,221 @@ main_body:
; GCN-LABEL: {{^}}sample_v2f32:
; GCN: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3
-define void @sample_v2f32(<2 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_v2f32(<2 x float> addrspace(1)* %out) {
main_body:
%r = call <2 x float> @llvm.amdgcn.image.sample.v2f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 0, i1 0, i1 0, i1 0, i1 0)
store <2 x float> %r, <2 x float> addrspace(1)* %out
ret void
}
+; GCN-LABEL: {{^}}adjust_writemask_sample_0:
+; GCN: image_sample v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x1{{$}}
+define amdgpu_kernel void @adjust_writemask_sample_0(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_01:
+; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x3{{$}}
+define amdgpu_kernel void @adjust_writemask_sample_01(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ %elt1 = extractelement <4 x float> %r, i32 1
+ store volatile float %elt0, float addrspace(1)* %out
+ store volatile float %elt1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_012:
+; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x7{{$}}
+define amdgpu_kernel void @adjust_writemask_sample_012(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ %elt1 = extractelement <4 x float> %r, i32 1
+ %elt2 = extractelement <4 x float> %r, i32 2
+ store volatile float %elt0, float addrspace(1)* %out
+ store volatile float %elt1, float addrspace(1)* %out
+ store volatile float %elt2, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_12:
+; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x6{{$}}
+define amdgpu_kernel void @adjust_writemask_sample_12(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+ %elt1 = extractelement <4 x float> %r, i32 1
+ %elt2 = extractelement <4 x float> %r, i32 2
+ store volatile float %elt1, float addrspace(1)* %out
+ store volatile float %elt2, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_03:
+; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x9{{$}}
+define amdgpu_kernel void @adjust_writemask_sample_03(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ %elt3 = extractelement <4 x float> %r, i32 3
+ store volatile float %elt0, float addrspace(1)* %out
+ store volatile float %elt3, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_13:
+; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xa{{$}}
+define amdgpu_kernel void @adjust_writemask_sample_13(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+ %elt1 = extractelement <4 x float> %r, i32 1
+ %elt3 = extractelement <4 x float> %r, i32 3
+ store volatile float %elt1, float addrspace(1)* %out
+ store volatile float %elt3, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_123:
+; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xe{{$}}
+define amdgpu_kernel void @adjust_writemask_sample_123(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+ %elt1 = extractelement <4 x float> %r, i32 1
+ %elt2 = extractelement <4 x float> %r, i32 2
+ %elt3 = extractelement <4 x float> %r, i32 3
+ store volatile float %elt1, float addrspace(1)* %out
+ store volatile float %elt2, float addrspace(1)* %out
+ store volatile float %elt3, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_variable_dmask_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_variable_dmask_enabled(float addrspace(1)* %out, i32 %dmask) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 %dmask, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_cl_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_cl_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_d_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_d_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_d_cl_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_d_cl_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_l_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_l_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_b_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_b_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_b_cl_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_b_cl_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_lz_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_lz_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_cd_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_cd_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.cd.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_cd_cl_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_cd_cl_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
declare <4 x float> @llvm.amdgcn.image.sample.cl.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
declare <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll
index d10fd0824692..42d7bc0e7778 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll
@@ -3,7 +3,7 @@
; GCN-LABEL: {{^}}sample:
; GCN: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -12,7 +12,7 @@ main_body:
; GCN-LABEL: {{^}}sample_cl:
; GCN: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_cl(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@ main_body:
; GCN-LABEL: {{^}}sample_d:
; GCN: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_d(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_d(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -30,7 +30,7 @@ main_body:
; GCN-LABEL: {{^}}sample_d_cl:
; GCN: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_d_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_d_cl(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -39,7 +39,7 @@ main_body:
; GCN-LABEL: {{^}}sample_l:
; GCN: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_l(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_l(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -48,7 +48,7 @@ main_body:
; GCN-LABEL: {{^}}sample_b:
; GCN: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_b(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_b(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -57,7 +57,7 @@ main_body:
; GCN-LABEL: {{^}}sample_b_cl:
; GCN: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_b_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_b_cl(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -66,7 +66,7 @@ main_body:
; GCN-LABEL: {{^}}sample_lz:
; GCN: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_lz(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_lz(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -75,7 +75,7 @@ main_body:
; GCN-LABEL: {{^}}sample_cd:
; GCN: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_cd(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_cd(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -84,7 +84,7 @@ main_body:
; GCN-LABEL: {{^}}sample_cd_cl:
; GCN: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_cd_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_cd_cl(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -93,7 +93,7 @@ main_body:
; GCN-LABEL: {{^}}sample_c:
; GCN: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.c.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -102,7 +102,7 @@ main_body:
; GCN-LABEL: {{^}}sample_c_cl:
; GCN: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_cl(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.c.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -111,7 +111,7 @@ main_body:
; GCN-LABEL: {{^}}sample_c_d:
; GCN: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_d(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_d(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -120,7 +120,7 @@ main_body:
; GCN-LABEL: {{^}}sample_c_d_cl:
; GCN: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_d_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_d_cl(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -129,7 +129,7 @@ main_body:
; GCN-LABEL: {{^}}sample_c_l:
; GCN: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_l(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_l(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -138,7 +138,7 @@ main_body:
; GCN-LABEL: {{^}}sample_c_b:
; GCN: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_b(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_b(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.c.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -147,7 +147,7 @@ main_body:
; GCN-LABEL: {{^}}sample_c_b_cl:
; GCN: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_b_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_b_cl(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -156,7 +156,7 @@ main_body:
; GCN-LABEL: {{^}}sample_c_lz:
; GCN: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_lz(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_lz(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -165,7 +165,7 @@ main_body:
; GCN-LABEL: {{^}}sample_c_cd:
; GCN: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_cd(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_cd(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -174,13 +174,232 @@ main_body:
; GCN-LABEL: {{^}}sample_c_cd_cl:
; GCN: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
store <4 x float> %r, <4 x float> addrspace(1)* %out
ret void
}
+; GCN-LABEL: {{^}}adjust_writemask_sample_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_cl_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_cl_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_d_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_d_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_d_cl_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_d_cl_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_l_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_l_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_b_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_b_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_b_cl_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_b_cl_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_lz_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_lz_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_cd_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_cd_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_cd_cl_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_cd_cl_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_c_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_c_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.c.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_c_cl_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_c_cl_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.c.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_c_d_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_c_d_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_c_d_cl_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_c_d_cl_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_c_l_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_c_l_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_c_b_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_c_b_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_c_b_cl_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_c_b_cl_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_c_lz_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_c_lz_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_c_cd_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_c_cd_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_c_cd_cl_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_c_cd_cl_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %elt0 = extractelement <4 x float> %r, i32 0
+ store float %elt0, float addrspace(1)* %out
+ ret void
+}
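+
+; In each *_none_enabled test above the dmask operand is 0, so no channel of
+; the result is requested; the expectation encoded by the GCN-NOT checks is
+; that the sample instruction folds away entirely and that the store of the
+; resulting undef value is deleted along with it.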
declare <4 x float> @llvm.amdgcn.image.sample.o.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
declare <4 x float> @llvm.amdgcn.image.sample.cl.o.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
index 9ba5c69a9a24..c4795a23cd5b 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
@@ -1,5 +1,7 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=kabini -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,16BANK %s
+; RUN: llc -march=amdgcn -mcpu=stoney -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,16BANK %s
; GCN-LABEL: {{^}}v_interp:
; GCN-NOT: s_wqm
@@ -8,17 +10,17 @@
; GCN-DAG: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr0.y{{$}}
; GCN-DAG: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr0.y{{$}}
; GCN-DAG: v_interp_mov_f32 v{{[0-9]+}}, p0, attr0.x{{$}}
-define amdgpu_ps void @v_interp(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x float>) {
+define amdgpu_ps void @v_interp(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x float> %arg4) #0 {
main_body:
- %i = extractelement <2 x float> %4, i32 0
- %j = extractelement <2 x float> %4, i32 1
- %p0_0 = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 %3)
- %p1_0 = call float @llvm.amdgcn.interp.p2(float %p0_0, float %j, i32 0, i32 0, i32 %3)
- %p0_1 = call float @llvm.amdgcn.interp.p1(float %i, i32 1, i32 0, i32 %3)
- %p1_1 = call float @llvm.amdgcn.interp.p2(float %p0_1, float %j, i32 1, i32 0, i32 %3)
- %const = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %3)
+ %i = extractelement <2 x float> %arg4, i32 0
+ %j = extractelement <2 x float> %arg4, i32 1
+ %p0_0 = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 %arg3)
+ %p1_0 = call float @llvm.amdgcn.interp.p2(float %p0_0, float %j, i32 0, i32 0, i32 %arg3)
+ %p0_1 = call float @llvm.amdgcn.interp.p1(float %i, i32 1, i32 0, i32 %arg3)
+ %p1_1 = call float @llvm.amdgcn.interp.p2(float %p0_1, float %j, i32 1, i32 0, i32 %arg3)
+ %const = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %arg3)
%w = fadd float %p1_1, %const
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %p0_0, float %p0_0, float %p1_1, float %w)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_0, float %p0_0, float %p1_1, float %w, i1 true, i1 true) #0
ret void
}
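
; The p1/p2 pair above performs two-phase barycentric interpolation:
; v_interp_p1 computes p0 + i*dp10 and v_interp_p2 then adds j*dp20, while
; v_interp_mov with source p0 reads the flat (constant) attribute value
; directly (a rough summary of the instruction semantics, not extra checks).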
@@ -37,7 +39,8 @@ main_body:
; GCN-DAG: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr63.w{{$}}
; GCN-DAG: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr64.w{{$}}
; GCN-DAG: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr64.x{{$}}
-define amdgpu_ps void @v_interp_p1(float %i) {
+define amdgpu_ps void @v_interp_p1(float %i) #0 {
+bb:
%p0_0 = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 256)
%p0_1 = call float @llvm.amdgcn.interp.p1(float %i, i32 1, i32 0, i32 256)
%p0_2 = call float @llvm.amdgcn.interp.p1(float %i, i32 2, i32 0, i32 256)
@@ -77,7 +80,8 @@ define amdgpu_ps void @v_interp_p1(float %i) {
; GCN-DAG: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr63.x{{$}}
; GCN-DAG: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr64.x{{$}}
; GCN-DAG: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr64.x{{$}}
-define amdgpu_ps void @v_interp_p2(float %x, float %j) {
+define amdgpu_ps void @v_interp_p2(float %x, float %j) #0 {
+bb:
%p2_0 = call float @llvm.amdgcn.interp.p2(float %x, float %j, i32 0, i32 0, i32 256)
%p2_1 = call float @llvm.amdgcn.interp.p2(float %x, float %j, i32 1, i32 0, i32 256)
%p2_2 = call float @llvm.amdgcn.interp.p2(float %x, float %j, i32 2, i32 0, i32 256)
@@ -118,7 +122,8 @@ define amdgpu_ps void @v_interp_p2(float %x, float %j) {
; GCN-DAG: v_interp_mov_f32 v{{[0-9]+}}, p10, attr64.y{{$}}
; GCN-DAG: v_interp_mov_f32 v{{[0-9]+}}, invalid_param_3, attr64.y{{$}}
; GCN-DAG: v_interp_mov_f32 v{{[0-9]+}}, invalid_param_10, attr64.x{{$}}
-define amdgpu_ps void @v_interp_mov(float %x, float %j) {
+define amdgpu_ps void @v_interp_mov(float %x, float %j) #0 {
+bb:
%mov_0 = call float @llvm.amdgcn.interp.mov(i32 0, i32 0, i32 0, i32 256)
%mov_1 = call float @llvm.amdgcn.interp.mov(i32 1, i32 0, i32 0, i32 256)
%mov_2 = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 256)
@@ -161,23 +166,57 @@ define amdgpu_ps void @v_interp_mov(float %x, float %j) {
; VI-DAG: v_interp_mov_f32 v{{[0-9]+}}, p0, attr0.x{{$}}
; VI: s_mov_b32 m0, -1{{$}}
; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
-define amdgpu_ps void @v_interp_readnone(float addrspace(3)* %lds) {
- store float 0.0, float addrspace(3)* %lds
+define amdgpu_ps void @v_interp_readnone(float addrspace(3)* %lds) #0 {
+bb:
+ store float 0.000000e+00, float addrspace(3)* %lds
%tmp1 = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 0)
%tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
- store float 0.0, float addrspace(3)* %tmp2
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1)
+ store float 0.000000e+00, float addrspace(3)* %tmp2
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
ret void
}
-; Function Attrs: nounwind readnone
-declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0
+; Test that v_interp_p1 uses different source and destination registers
+; on 16-bank LDS chips.
-; Function Attrs: nounwind readnone
-declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0
-
-declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #0
+; GCN-LABEL: {{^}}v_interp_p1_bank16_bug:
+; 16BANK-NOT: v_interp_p1_f32 [[DST:v[0-9]+]], [[DST]]
+define amdgpu_ps void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg13, [17 x <4 x i32>] addrspace(2)* byval %arg14, [34 x <8 x i32>] addrspace(2)* byval %arg15, float inreg %arg16, i32 inreg %arg17, <2 x i32> %arg18, <2 x i32> %arg19, <2 x i32> %arg20, <3 x i32> %arg21, <2 x i32> %arg22, <2 x i32> %arg23, <2 x i32> %arg24, float %arg25, float %arg26, float %arg27, float %arg28, float %arg29, float %arg30, i32 %arg31, float %arg32, float %arg33) #0 {
+main_body:
+ %i.i = extractelement <2 x i32> %arg19, i32 0
+ %j.i = extractelement <2 x i32> %arg19, i32 1
+ %i.f.i = bitcast i32 %i.i to float
+ %j.f.i = bitcast i32 %j.i to float
+ %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg17) #0
+ %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg17) #0
+ %i.i7 = extractelement <2 x i32> %arg19, i32 0
+ %j.i8 = extractelement <2 x i32> %arg19, i32 1
+ %i.f.i9 = bitcast i32 %i.i7 to float
+ %j.f.i10 = bitcast i32 %j.i8 to float
+ %p1.i11 = call float @llvm.amdgcn.interp.p1(float %i.f.i9, i32 1, i32 0, i32 %arg17) #0
+ %p2.i12 = call float @llvm.amdgcn.interp.p2(float %p1.i11, float %j.f.i10, i32 1, i32 0, i32 %arg17) #0
+ %i.i1 = extractelement <2 x i32> %arg19, i32 0
+ %j.i2 = extractelement <2 x i32> %arg19, i32 1
+ %i.f.i3 = bitcast i32 %i.i1 to float
+ %j.f.i4 = bitcast i32 %j.i2 to float
+ %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 0, i32 %arg17) #0
+ %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 0, i32 %arg17) #0
+ %tmp = call float @llvm.fabs.f32(float %p2.i)
+ %tmp34 = call float @llvm.fabs.f32(float %p2.i12)
+ %tmp35 = call float @llvm.fabs.f32(float %p2.i6)
+ %tmp36 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp, float %tmp34)
+ %tmp38 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp35, float 1.000000e+00)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp36, <2 x half> %tmp38, i1 true, i1 true) #0
+ ret void
+}
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
-attributes #0 = { nounwind readnone }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
index 5d4d4cd7ee46..055dddbfa8af 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -8,7 +8,7 @@
; CO-V2: s_load_dword s{{[0-9]+}}, s[4:5], 0xa
; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0xa
-define void @test(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) #1 {
%kernarg.segment.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
%header.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)*
%gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10
@@ -20,7 +20,7 @@ define void @test(i32 addrspace(1)* %out) #1 {
; ALL-LABEL: {{^}}test_implicit:
; 10 + 9 (36 prepended implicit bytes) + 2 (out pointer) = 21 = 0x15
; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0x15
-define void @test_implicit(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 {
%implicitarg.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
%header.ptr = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)*
%gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10
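
; The 0x15 above, worked in dword units (assuming the dword-granular SMRD
; immediate offsets of SI-class targets and a 64-bit out pointer; this is a
; reading of the comment above, not an extra check):
;   out pointer:             8 bytes  = 2 dwords
;   prepended implicit area: 36 bytes = 9 dwords
;   getelementptr index:               10 dwords
;   2 + 9 + 10 = 21 dwords = 0x15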
@@ -39,7 +39,7 @@ define void @test_implicit(i32 addrspace(1)* %out) #1 {
; ALL: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[VAL]]
; MESA: buffer_store_dword [[V_VAL]]
; HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[V_VAL]]
-define void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x i8> %in) #1 {
+define amdgpu_kernel void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x i8> %in) #1 {
%implicitarg.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
%arg.ptr = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)*
%val = load i32, i32 addrspace(2)* %arg.ptr
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
index 6720cbe9d8da..fe211d356070 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
@@ -7,7 +7,7 @@ declare half @llvm.amdgcn.ldexp.f16(half %a, i32 %b)
; GCN: buffer_load_dword v[[B_I32:[0-9]+]]
; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_I32]]
; GCN: buffer_store_short v[[R_F16]]
-define void @ldexp_f16(
+define amdgpu_kernel void @ldexp_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
i32 addrspace(1)* %b) {
@@ -22,7 +22,7 @@ define void @ldexp_f16(
; GCN: buffer_load_dword v[[B_I32:[0-9]+]]
; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[B_I32]]
; GCN: buffer_store_short v[[R_F16]]
-define void @ldexp_f16_imm_a(
+define amdgpu_kernel void @ldexp_f16_imm_a(
half addrspace(1)* %r,
i32 addrspace(1)* %b) {
%b.val = load i32, i32 addrspace(1)* %b
@@ -35,7 +35,7 @@ define void @ldexp_f16_imm_a(
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; VI: v_ldexp_f16_e64 v[[R_F16:[0-9]+]], v[[A_F16]], 2{{$}}
; GCN: buffer_store_short v[[R_F16]]
-define void @ldexp_f16_imm_b(
+define amdgpu_kernel void @ldexp_f16_imm_b(
half addrspace(1)* %r,
half addrspace(1)* %a) {
%a.val = load half, half addrspace(1)* %a
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll
index a23defd742a8..1ab4e8b80630 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll
@@ -7,7 +7,7 @@ declare double @llvm.amdgcn.ldexp.f64(double, i32) nounwind readnone
; SI-LABEL: {{^}}test_ldexp_f32:
; SI: v_ldexp_f32
; SI: s_endpgm
-define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind {
+define amdgpu_kernel void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind {
%result = call float @llvm.amdgcn.ldexp.f32(float %a, i32 %b) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
@@ -16,7 +16,7 @@ define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind
; SI-LABEL: {{^}}test_ldexp_f64:
; SI: v_ldexp_f64
; SI: s_endpgm
-define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind {
+define amdgpu_kernel void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind {
%result = call double @llvm.amdgcn.ldexp.f64(double %a, i32 %b) nounwind readnone
store double %result, double addrspace(1)* %out, align 8
ret void
@@ -24,7 +24,7 @@ define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwi
; SI-LABEL: {{^}}test_ldexp_undef_f32:
; SI-NOT: v_ldexp_f32
-define void @test_ldexp_undef_f32(float addrspace(1)* %out, i32 %b) nounwind {
+define amdgpu_kernel void @test_ldexp_undef_f32(float addrspace(1)* %out, i32 %b) nounwind {
%result = call float @llvm.amdgcn.ldexp.f32(float undef, i32 %b) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll
index 014369b45015..bc599897f82a 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll
@@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.lerp(i32, i32, i32) #0
; GCN-LABEL: {{^}}v_lerp:
; GCN: v_lerp_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_lerp(i32 addrspace(1)* %out, i32 %src) nounwind {
+define amdgpu_kernel void @v_lerp(i32 addrspace(1)* %out, i32 %src) nounwind {
%result= call i32 @llvm.amdgcn.lerp(i32 %src, i32 100, i32 100) #0
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll
index f78257f1d226..feecd6c0e35d 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll
@@ -7,7 +7,7 @@ declare float @llvm.amdgcn.log.clamp.f32(float) #0
; GCN-LABEL: {{^}}v_log_clamp_f32:
; GCN: v_log_clamp_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @v_log_clamp_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @v_log_clamp_f32(float addrspace(1)* %out, float %src) #1 {
%log.clamp = call float @llvm.amdgcn.log.clamp.f32(float %src) #0
store float %log.clamp, float addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
index 303446b63315..ab76c870796b 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
@@ -1,24 +1,22 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}mbcnt_intrinsics:
; GCN: v_mbcnt_lo_u32_b32_e64 [[LO:v[0-9]+]], -1, 0
; SI: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]]
; VI: v_mbcnt_hi_u32_b32_e64 {{v[0-9]+}}, -1, [[LO]]
-
-define amdgpu_ps void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) {
+define amdgpu_ps void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3) {
main_body:
- %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1
- %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) #1
- %4 = bitcast i32 %hi to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %4, float %4, float %4, float %4)
+ %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) #0
+ %tmp = bitcast i32 %hi to float
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp, float %tmp, float %tmp, float %tmp, i1 true, i1 true) #1
ret void
}
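
; With an all-ones mask, the mbcnt.lo/mbcnt.hi pair above is the standard
; idiom for a lane's index within a 64-lane wave: mbcnt.lo counts the set
; mask bits below the current lane across the low 32 lanes, and mbcnt.hi
; continues the count through the high 32 (a summary of the intrinsic
; semantics, not an extra check).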
-declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
-
-declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
-attributes #1 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
index 35fdba8f34a3..8baaad190406 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
@@ -7,7 +7,7 @@
; VI: v_mov_b32_e32 v0, s{{[0-9]+}}
; VI: s_nop 1
; VI: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
-define void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
store i32 %tmp0, i32 addrspace(1)* %out
ret void
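
; The s_nop 1 expected by these checks provides the two wait states a DPP
; instruction needs between a VALU write to its source VGPR and the DPP read
; of it (s_nop N idles for N+1 cycles; an interpretation of the checks, not
; a statement of the exact hazard rule).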
@@ -19,7 +19,7 @@ define void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
; VI: s_nop 1
; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
-define void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) {
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
%tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0
store i32 %tmp1, i32 addrspace(1)* %out
@@ -36,7 +36,7 @@ define void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) {
; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
; VI: s_nop 1
; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
-define void @dpp_first_in_bb(float addrspace(1)* %out, float addrspace(1)* %in, float %cond, float %a, float %b) {
+define amdgpu_kernel void @dpp_first_in_bb(float addrspace(1)* %out, float addrspace(1)* %in, float %cond, float %a, float %b) {
%cmp = fcmp oeq float %cond, 0.0
br i1 %cmp, label %if, label %else
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
index 7c2495e096ec..3a2b87cd87f3 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
@@ -5,7 +5,7 @@ declare i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64, i32, i64) #0
; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8:
; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
%result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0
store i64 %result, i64 addrspace(1)* %out, align 4
ret void
@@ -13,7 +13,7 @@ define void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8_non_immediate:
; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_mqsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
+define amdgpu_kernel void @v_mqsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
%result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 %a, i64 %b) #0
store i64 %result, i64 addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
index 04bb97a9eb57..a8d03bf6bbac 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
@@ -5,7 +5,7 @@ declare <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64, i32, <4 x i32>) #0
; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_non_inline_constant:
; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out, i64 %src) {
%result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 100, <4 x i32> <i32 100, i32 100, i32 100, i32 100>) #0
store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
ret void
@@ -13,7 +13,7 @@ define void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out
; GCN-LABEL: {{^}}v_mqsad_u32_u8_non_immediate:
; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %b) {
+define amdgpu_kernel void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %b) {
%result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %b) #0
store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
ret void
@@ -21,7 +21,7 @@ define void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src
; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_integer_immediate:
; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
+define amdgpu_kernel void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
%result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> <i32 10, i32 20, i32 30, i32 40>) #0
store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
ret void
@@ -29,7 +29,7 @@ define void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %ou
; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_fp_immediate:
; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
+define amdgpu_kernel void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
%result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> <i32 1065353216, i32 0, i32 0, i32 0>) #0
store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
ret void
@@ -37,7 +37,7 @@ define void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i6
; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_sgpr_vgpr:
; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_mqsad_u32_u8_use_sgpr_vgpr(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> addrspace(1)* %input) {
+define amdgpu_kernel void @v_mqsad_u32_u8_use_sgpr_vgpr(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> addrspace(1)* %input) {
%in = load <4 x i32>, <4 x i32> addrspace(1) * %input
%result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %in) #0
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll
index 83d13ab26846..dfaac042227c 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll
@@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.msad.u8(i32, i32, i32) #0
; GCN-LABEL: {{^}}v_msad_u8:
; GCN: v_msad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_msad_u8(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_msad_u8(i32 addrspace(1)* %out, i32 %src) {
%result= call i32 @llvm.amdgcn.msad.u8(i32 %src, i32 100, i32 100) #0
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
@@ -13,7 +13,7 @@ define void @v_msad_u8(i32 addrspace(1)* %out, i32 %src) {
; GCN-LABEL: {{^}}v_msad_u8_non_immediate:
; GCN: v_msad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_msad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
+define amdgpu_kernel void @v_msad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
%result= call i32 @llvm.amdgcn.msad.u8(i32 %src, i32 %a, i32 %b) #0
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
index fd1a463fd3e9..f0af876567b4 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
; CHECK-LABEL: {{^}}test1:
; CHECK: v_cndmask_b32_e64 v0, 0, 1, exec
@@ -7,7 +7,7 @@
; there is no WQM use and therefore llvm.amdgcn.ps.live is constant. However,
; the expectation is that the intrinsic will be used in non-trivial shaders,
; so such an optimization doesn't seem worth the effort.
-define amdgpu_ps float @test1() {
+define amdgpu_ps float @test1() #0 {
%live = call i1 @llvm.amdgcn.ps.live()
%live.32 = zext i1 %live to i32
%r = bitcast i32 %live.32 to float
@@ -19,12 +19,11 @@ define amdgpu_ps float @test1() {
; CHECK-DAG: s_wqm_b64 exec, exec
; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[LIVE]]
; CHECK: image_sample v0, [[VAR]],
-define amdgpu_ps float @test2() {
+define amdgpu_ps float @test2() #0 {
%live = call i1 @llvm.amdgcn.ps.live()
%live.32 = zext i1 %live to i32
-
- %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %live.32, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-
+ %live.32.bc = bitcast i32 %live.32 to float
+ %t = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %live.32.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%r = extractelement <4 x float> %t, i32 0
ret float %r
}
@@ -35,7 +34,7 @@ define amdgpu_ps float @test2() {
; CHECK-DAG: s_xor_b64 [[HELPER:s\[[0-9]+:[0-9]+\]]], [[LIVE]], -1
; CHECK-DAG: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[HELPER]]
; CHECK: ; %dead
-define amdgpu_ps float @test3(i32 %in) {
+define amdgpu_ps float @test3(i32 %in) #0 {
entry:
%live = call i1 @llvm.amdgcn.ps.live()
br i1 %live, label %end, label %dead
@@ -46,14 +45,15 @@ dead:
end:
%tc = phi i32 [ %in, %entry ], [ %tc.dead, %dead ]
- %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %tc, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-
+ %tc.bc = bitcast i32 %tc to float
+ %t = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %tc.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
%r = extractelement <4 x float> %t, i32 0
ret float %r
}
-declare i1 @llvm.amdgcn.ps.live() #0
-
-declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare i1 @llvm.amdgcn.ps.live() #1
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
-attributes #0 = { nounwind readnone }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
index ece4224f6e67..be71225c5e06 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
@@ -5,7 +5,7 @@ declare i64 @llvm.amdgcn.qsad.pk.u16.u8(i64, i32, i64) #0
; GCN-LABEL: {{^}}v_qsad_pk_u16_u8:
; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
%result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0
store i64 %result, i64 addrspace(1)* %out, align 4
ret void
@@ -13,7 +13,7 @@ define void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
; GCN-LABEL: {{^}}v_qsad_pk_u16_u8_non_immediate:
; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_qsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
+define amdgpu_kernel void @v_qsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
%result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, i32 %a, i64 %b) #0
store i64 %result, i64 addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
index 6bf871543ca2..9200fe7c67b1 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
@@ -6,7 +6,7 @@
; GCN-LABEL: {{^}}test:
; GCN: enable_sgpr_queue_ptr = 1
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
%queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
%header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
%value = load i32, i32 addrspace(2)* %header_ptr
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
index f0b8e2a0293f..0f1fa15f47cc 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
@@ -7,7 +7,7 @@ declare half @llvm.amdgcn.rcp.f16(half %a)
; VI: v_rcp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @rcp_f16(
+define amdgpu_kernel void @rcp_f16(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll
index d53861456c78..71db76d902b7 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll
@@ -7,7 +7,7 @@ declare float @llvm.amdgcn.rcp.legacy(float) #0
; GCN-LABEL: {{^}}rcp_legacy_f32:
; GCN: v_rcp_legacy_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @rcp_legacy_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @rcp_legacy_f32(float addrspace(1)* %out, float %src) #1 {
%rcp = call float @llvm.amdgcn.rcp.legacy(float %src) #0
store float %rcp, float addrspace(1)* %out, align 4
ret void
@@ -16,7 +16,7 @@ define void @rcp_legacy_f32(float addrspace(1)* %out, float %src) #1 {
; TODO: Really these should be constant folded
; GCN-LABEL: {{^}}rcp_legacy_f32_constant_4.0
; GCN: v_rcp_legacy_f32_e32 {{v[0-9]+}}, 4.0
-define void @rcp_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rcp_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 {
%rcp = call float @llvm.amdgcn.rcp.legacy(float 4.0) #0
store float %rcp, float addrspace(1)* %out, align 4
ret void
@@ -24,7 +24,7 @@ define void @rcp_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 {
; GCN-LABEL: {{^}}rcp_legacy_f32_constant_100.0
; GCN: v_rcp_legacy_f32_e32 {{v[0-9]+}}, 0x42c80000
-define void @rcp_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rcp_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 {
%rcp = call float @llvm.amdgcn.rcp.legacy(float 100.0) #0
store float %rcp, float addrspace(1)* %out, align 4
ret void
@@ -32,7 +32,7 @@ define void @rcp_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 {
; GCN-LABEL: {{^}}rcp_legacy_undef_f32:
; GCN-NOT: v_rcp_legacy_f32
-define void @rcp_legacy_undef_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rcp_legacy_undef_f32(float addrspace(1)* %out) #1 {
%rcp = call float @llvm.amdgcn.rcp.legacy(float undef)
store float %rcp, float addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
index 825231bf8680..ad2d84b7911b 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
@@ -8,17 +8,35 @@ declare float @llvm.sqrt.f32(float) #0
; FUNC-LABEL: {{^}}rcp_undef_f32:
; SI-NOT: v_rcp_f32
-define void @rcp_undef_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rcp_undef_f32(float addrspace(1)* %out) #1 {
%rcp = call float @llvm.amdgcn.rcp.f32(float undef)
store float %rcp, float addrspace(1)* %out, align 4
ret void
}
+; FUNC-LABEL: {{^}}rcp_2_f32:
+; SI-NOT: v_rcp_f32
+; SI: v_mov_b32_e32 v{{[0-9]+}}, 0.5
+define amdgpu_kernel void @rcp_2_f32(float addrspace(1)* %out) #1 {
+ %rcp = call float @llvm.amdgcn.rcp.f32(float 2.0)
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rcp_10_f32:
+; SI-NOT: v_rcp_f32
+; SI: v_mov_b32_e32 v{{[0-9]+}}, 0x3dcccccd
+define amdgpu_kernel void @rcp_10_f32(float addrspace(1)* %out) #1 {
+ %rcp = call float @llvm.amdgcn.rcp.f32(float 10.0)
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
+
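+; Why 0x3dcccccd: rcp(10.0) folds to 0.1, whose binary32 encoding is
+; (0x7b << 23) | 0x4ccccd = 0x3dcccccd (sign 0, biased exponent 123, 23-bit
+; mantissa rounded to nearest). rcp(2.0) folds to 0.5, which is a GCN inline
+; constant and needs no literal. (A sketch of the encoding arithmetic.)
+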
; FUNC-LABEL: {{^}}safe_no_fp32_denormals_rcp_f32:
; SI: v_rcp_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}
; SI-NOT: [[RESULT]]
; SI: buffer_store_dword [[RESULT]]
-define void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src) #1 {
%rcp = fdiv float 1.0, %src
store float %rcp, float addrspace(1)* %out, align 4
ret void
@@ -28,7 +46,7 @@ define void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src
; SI: v_rcp_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}
; SI-NOT: [[RESULT]]
; SI: buffer_store_dword [[RESULT]]
-define void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #4 {
+define amdgpu_kernel void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #4 {
%rcp = fdiv float 1.0, %src
store float %rcp, float addrspace(1)* %out, align 4
ret void
@@ -36,7 +54,7 @@ define void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src
; FUNC-LABEL: {{^}}unsafe_f32_denormals_rcp_pat_f32:
; SI: v_div_scale_f32
-define void @unsafe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #3 {
+define amdgpu_kernel void @unsafe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #3 {
%rcp = fdiv float 1.0, %src
store float %rcp, float addrspace(1)* %out, align 4
ret void
@@ -45,7 +63,7 @@ define void @unsafe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %s
; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f32:
; SI: v_sqrt_f32_e32
; SI: v_rcp_f32_e32
-define void @safe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @safe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 {
%sqrt = call float @llvm.sqrt.f32(float %src)
%rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt)
store float %rcp, float addrspace(1)* %out, align 4
@@ -54,7 +72,7 @@ define void @safe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 {
; FUNC-LABEL: {{^}}unsafe_rsq_rcp_pat_f32:
; SI: v_rsq_f32_e32
-define void @unsafe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #2 {
+define amdgpu_kernel void @unsafe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #2 {
%sqrt = call float @llvm.sqrt.f32(float %src)
%rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt)
store float %rcp, float addrspace(1)* %out, align 4
@@ -65,7 +83,7 @@ define void @unsafe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #2 {
; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
; SI-NOT: [[RESULT]]
; SI: buffer_store_dwordx2 [[RESULT]]
-define void @rcp_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @rcp_f64(double addrspace(1)* %out, double %src) #1 {
%rcp = call double @llvm.amdgcn.rcp.f64(double %src)
store double %rcp, double addrspace(1)* %out, align 8
ret void
@@ -75,7 +93,7 @@ define void @rcp_f64(double addrspace(1)* %out, double %src) #1 {
; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
; SI-NOT: [[RESULT]]
; SI: buffer_store_dwordx2 [[RESULT]]
-define void @unsafe_rcp_f64(double addrspace(1)* %out, double %src) #2 {
+define amdgpu_kernel void @unsafe_rcp_f64(double addrspace(1)* %out, double %src) #2 {
%rcp = call double @llvm.amdgcn.rcp.f64(double %src)
store double %rcp, double addrspace(1)* %out, align 8
ret void
@@ -83,7 +101,7 @@ define void @unsafe_rcp_f64(double addrspace(1)* %out, double %src) #2 {
; FUNC-LABEL: {{^}}rcp_pat_f64:
; SI: v_div_scale_f64
-define void @rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
%rcp = fdiv double 1.0, %src
store double %rcp, double addrspace(1)* %out, align 8
ret void
@@ -93,7 +111,7 @@ define void @rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
; SI-NOT: [[RESULT]]
; SI: buffer_store_dwordx2 [[RESULT]]
-define void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
+define amdgpu_kernel void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
%rcp = fdiv double 1.0, %src
store double %rcp, double addrspace(1)* %out, align 8
ret void
@@ -103,7 +121,7 @@ define void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
; SI-NOT: v_rsq_f64_e32
; SI: v_sqrt_f64
; SI: v_rcp_f64
-define void @safe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @safe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
%sqrt = call double @llvm.sqrt.f64(double %src)
%rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
store double %rcp, double addrspace(1)* %out, align 8
@@ -114,7 +132,7 @@ define void @safe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
; SI: v_rsq_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
; SI-NOT: [[RESULT]]
; SI: buffer_store_dwordx2 [[RESULT]]
-define void @unsafe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
+define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
%sqrt = call double @llvm.sqrt.f64(double %src)
%rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
store double %rcp, double addrspace(1)* %out, align 8
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 2569108e7b18..9f5c809455ea 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) #0
; CHECK-LABEL: {{^}}test_readfirstlane:
; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v{{[0-9]+}}
-define void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 {
+define amdgpu_kernel void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 {
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src)
store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
ret void
@@ -13,7 +13,7 @@ define void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 {
; CHECK-LABEL: {{^}}test_readfirstlane_imm:
; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32
; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[VVAL]]
-define void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 {
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
ret void
@@ -25,7 +25,7 @@ define void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 {
; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]]
; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[VVAL]]
-define void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 {
%m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"()
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0)
store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 436ffff692c6..5e892fad3741 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.readlane(i32, i32) #0
; CHECK-LABEL: {{^}}test_readlane_sreg:
; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-define void @test_readlane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
+define amdgpu_kernel void @test_readlane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
%readlane = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1)
store i32 %readlane, i32 addrspace(1)* %out, align 4
ret void
@@ -13,7 +13,7 @@ define void @test_readlane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1
; CHECK-LABEL: {{^}}test_readlane_imm_sreg:
; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32
; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}}
-define void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
+define amdgpu_kernel void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
%readlane = call i32 @llvm.amdgcn.readlane(i32 32, i32 %src1)
store i32 %readlane, i32 addrspace(1)* %out, align 4
ret void
@@ -25,7 +25,7 @@ define void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]]
; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}}
-define void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
+define amdgpu_kernel void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
%m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"()
%readlane = call i32 @llvm.amdgcn.readlane(i32 %m0, i32 %src1)
store i32 %readlane, i32 addrspace(1)* %out, align 4
@@ -34,7 +34,7 @@ define void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
; CHECK-LABEL: {{^}}test_readlane_imm:
; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32
-define void @test_readlane_imm(i32 addrspace(1)* %out, i32 %src0) #1 {
+define amdgpu_kernel void @test_readlane_imm(i32 addrspace(1)* %out, i32 %src0) #1 {
%readlane = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 32) #0
store i32 %readlane, i32 addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
index 5f40e0d0986f..3611047f1277 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
@@ -12,7 +12,7 @@ declare double @llvm.amdgcn.rsq.clamp.f64(double) #1
; VI-DAG: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]]
; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xff7fffff, [[MIN]]
; VI: buffer_store_dword [[RESULT]]
-define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
%rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
store float %rsq_clamp, float addrspace(1)* %out
ret void
@@ -30,7 +30,7 @@ define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}]
; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW1]]:[[HIGH2]]]
-define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 {
+define amdgpu_kernel void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 {
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
store double %rsq_clamp, double addrspace(1)* %out
ret void
@@ -38,7 +38,7 @@ define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 {
; FUNC-LABEL: {{^}}rsq_clamp_undef_f32:
; SI-NOT: v_rsq_clamp_f32
-define void @rsq_clamp_undef_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @rsq_clamp_undef_f32(float addrspace(1)* %out) #0 {
%rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef)
store float %rsq_clamp, float addrspace(1)* %out
ret void
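
; A note on the VI checks in this file: VI lacks a clamped rsq opcode, so
; rsq.clamp is expected to expand to max(min(rsq(x), +MAX), -MAX); the f32
; literals 0x7f7fffff and 0xff7fffff are the binary32 encodings of +FLT_MAX
; and -FLT_MAX (an interpretation of the checks, not a statement of the
; exact lowering).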
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
index 2022d0289862..fd4802140810 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
@@ -7,7 +7,7 @@ declare half @llvm.amdgcn.rsq.f16(half %a)
; VI: v_rsq_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @rsq_f16(
+define amdgpu_kernel void @rsq_f16(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll
index 47bd0d82b834..7f4c2cb19a32 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll
@@ -4,7 +4,7 @@ declare float @llvm.amdgcn.rsq.legacy(float) #0
; FUNC-LABEL: {{^}}rsq_legacy_f32:
; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
%rsq = call float @llvm.amdgcn.rsq.legacy(float %src) #0
store float %rsq, float addrspace(1)* %out, align 4
ret void
@@ -13,7 +13,7 @@ define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
; TODO: Really these should be constant folded
; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_4.0
; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 4.0
-define void @rsq_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 {
%rsq = call float @llvm.amdgcn.rsq.legacy(float 4.0) #0
store float %rsq, float addrspace(1)* %out, align 4
ret void
@@ -21,7 +21,7 @@ define void @rsq_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 {
; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_100.0
; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 0x42c80000
-define void @rsq_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 {
%rsq = call float @llvm.amdgcn.rsq.legacy(float 100.0) #0
store float %rsq, float addrspace(1)* %out, align 4
ret void
@@ -29,7 +29,7 @@ define void @rsq_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 {
; FUNC-LABEL: {{^}}rsq_legacy_undef_f32:
; SI-NOT: v_rsq_legacy_f32
-define void @rsq_legacy_undef_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_legacy_undef_f32(float addrspace(1)* %out) #1 {
%rsq = call float @llvm.amdgcn.rsq.legacy(float undef)
store float %rsq, float addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
index c644288977a3..0ce26d0fe876 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
@@ -6,7 +6,7 @@ declare double @llvm.amdgcn.rsq.f64(double) #0
; FUNC-LABEL: {{^}}rsq_f32:
; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @rsq_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @rsq_f32(float addrspace(1)* %out, float %src) #1 {
%rsq = call float @llvm.amdgcn.rsq.f32(float %src) #0
store float %rsq, float addrspace(1)* %out, align 4
ret void
@@ -15,7 +15,7 @@ define void @rsq_f32(float addrspace(1)* %out, float %src) #1 {
; TODO: Really these should be constant folded
; FUNC-LABEL: {{^}}rsq_f32_constant_4.0
; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0
-define void @rsq_f32_constant_4.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_f32_constant_4.0(float addrspace(1)* %out) #1 {
%rsq = call float @llvm.amdgcn.rsq.f32(float 4.0) #0
store float %rsq, float addrspace(1)* %out, align 4
ret void
@@ -23,7 +23,7 @@ define void @rsq_f32_constant_4.0(float addrspace(1)* %out) #1 {
; FUNC-LABEL: {{^}}rsq_f32_constant_100.0
; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000
-define void @rsq_f32_constant_100.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_f32_constant_100.0(float addrspace(1)* %out) #1 {
%rsq = call float @llvm.amdgcn.rsq.f32(float 100.0) #0
store float %rsq, float addrspace(1)* %out, align 4
ret void
@@ -31,7 +31,7 @@ define void @rsq_f32_constant_100.0(float addrspace(1)* %out) #1 {
; FUNC-LABEL: {{^}}rsq_f64:
; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @rsq_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @rsq_f64(double addrspace(1)* %out, double %src) #1 {
%rsq = call double @llvm.amdgcn.rsq.f64(double %src) #0
store double %rsq, double addrspace(1)* %out, align 4
ret void
@@ -40,7 +40,7 @@ define void @rsq_f64(double addrspace(1)* %out, double %src) #1 {
; TODO: Really these should be constant folded
; FUNC-LABEL: {{^}}rsq_f64_constant_4.0
; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, 4.0
-define void @rsq_f64_constant_4.0(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_f64_constant_4.0(double addrspace(1)* %out) #1 {
%rsq = call double @llvm.amdgcn.rsq.f64(double 4.0) #0
store double %rsq, double addrspace(1)* %out, align 4
ret void
@@ -50,7 +50,7 @@ define void @rsq_f64_constant_4.0(double addrspace(1)* %out) #1 {
; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0x40590000
; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @rsq_f64_constant_100.0(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_f64_constant_100.0(double addrspace(1)* %out) #1 {
%rsq = call double @llvm.amdgcn.rsq.f64(double 100.0) #0
store double %rsq, double addrspace(1)* %out, align 4
ret void
@@ -58,7 +58,7 @@ define void @rsq_f64_constant_100.0(double addrspace(1)* %out) #1 {
; FUNC-LABEL: {{^}}rsq_undef_f32:
; SI-NOT: v_rsq_f32
-define void @rsq_undef_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_undef_f32(float addrspace(1)* %out) #1 {
%rsq = call float @llvm.amdgcn.rsq.f32(float undef)
store float %rsq, float addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index 132e476d5e29..5f8ca28ec5f0 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -1,10 +1,13 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; GCN-LABEL: {{^}}test_barrier:
-; GCN: buffer_store_dword
-; GCN: s_waitcnt
+; GFX8: buffer_store_dword
+; GFX8: s_waitcnt
+; GFX9: flat_store_dword
+; GFX9-NOT: s_waitcnt
; GCN: s_barrier
-define void @test_barrier(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out) #0 {
entry:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
index ecd4ac6824cc..b488565c6b3a 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
@@ -9,7 +9,7 @@ declare void @llvm.amdgcn.s.waitcnt(i32) #0
; SI-NEXT: s_dcache_inv ; encoding: [0x00,0x00,0xc0,0xc7]
; VI-NEXT: s_dcache_inv ; encoding: [0x00,0x00,0x80,0xc0,0x00,0x00,0x00,0x00]
; GCN-NEXT: s_endpgm
-define void @test_s_dcache_inv() #0 {
+define amdgpu_kernel void @test_s_dcache_inv() #0 {
call void @llvm.amdgcn.s.dcache.inv()
ret void
}
@@ -18,7 +18,7 @@ define void @test_s_dcache_inv() #0 {
; GCN-NEXT: ; BB#0:
; GCN: s_dcache_inv
; GCN: s_waitcnt lgkmcnt(0) ; encoding
-define void @test_s_dcache_inv_insert_wait() #0 {
+define amdgpu_kernel void @test_s_dcache_inv_insert_wait() #0 {
call void @llvm.amdgcn.s.dcache.inv()
call void @llvm.amdgcn.s.waitcnt(i32 0)
br label %end
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
index 097f35d42c4f..a3a5c329f411 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
@@ -9,7 +9,7 @@ declare void @llvm.amdgcn.s.waitcnt(i32) #0
; CI-NEXT: s_dcache_inv_vol ; encoding: [0x00,0x00,0x40,0xc7]
; VI-NEXT: s_dcache_inv_vol ; encoding: [0x00,0x00,0x88,0xc0,0x00,0x00,0x00,0x00]
; GCN-NEXT: s_endpgm
-define void @test_s_dcache_inv_vol() #0 {
+define amdgpu_kernel void @test_s_dcache_inv_vol() #0 {
call void @llvm.amdgcn.s.dcache.inv.vol()
ret void
}
@@ -18,7 +18,7 @@ define void @test_s_dcache_inv_vol() #0 {
; GCN-NEXT: ; BB#0:
; GCN-NEXT: s_dcache_inv_vol
; GCN: s_waitcnt lgkmcnt(0) ; encoding
-define void @test_s_dcache_inv_vol_insert_wait() #0 {
+define amdgpu_kernel void @test_s_dcache_inv_vol_insert_wait() #0 {
call void @llvm.amdgcn.s.dcache.inv.vol()
call void @llvm.amdgcn.s.waitcnt(i32 0)
br label %end
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
index 9ecce7463f6b..909a85dda3e8 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
@@ -7,7 +7,7 @@ declare void @llvm.amdgcn.s.waitcnt(i32) #0
; VI-NEXT: ; BB#0:
; VI-NEXT: s_dcache_wb ; encoding: [0x00,0x00,0x84,0xc0,0x00,0x00,0x00,0x00]
; VI-NEXT: s_endpgm
-define void @test_s_dcache_wb() #0 {
+define amdgpu_kernel void @test_s_dcache_wb() #0 {
call void @llvm.amdgcn.s.dcache.wb()
ret void
}
@@ -16,7 +16,7 @@ define void @test_s_dcache_wb() #0 {
; VI-NEXT: ; BB#0:
; VI-NEXT: s_dcache_wb
; VI: s_waitcnt lgkmcnt(0) ; encoding
-define void @test_s_dcache_wb_insert_wait() #0 {
+define amdgpu_kernel void @test_s_dcache_wb_insert_wait() #0 {
call void @llvm.amdgcn.s.dcache.wb()
call void @llvm.amdgcn.s.waitcnt(i32 0)
br label %end
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
index 943f8c67a2e3..217bf97c41a4 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
@@ -7,7 +7,7 @@ declare void @llvm.amdgcn.s.waitcnt(i32) #0
; VI-NEXT: ; BB#0:
; VI-NEXT: s_dcache_wb_vol ; encoding: [0x00,0x00,0x8c,0xc0,0x00,0x00,0x00,0x00]
; VI-NEXT: s_endpgm
-define void @test_s_dcache_wb_vol() #0 {
+define amdgpu_kernel void @test_s_dcache_wb_vol() #0 {
call void @llvm.amdgcn.s.dcache.wb.vol()
ret void
}
@@ -16,7 +16,7 @@ define void @test_s_dcache_wb_vol() #0 {
; VI-NEXT: ; BB#0:
; VI-NEXT: s_dcache_wb_vol
; VI: s_waitcnt lgkmcnt(0) ; encoding
-define void @test_s_dcache_wb_vol_insert_wait() #0 {
+define amdgpu_kernel void @test_s_dcache_wb_vol_insert_wait() #0 {
call void @llvm.amdgcn.s.dcache.wb.vol()
call void @llvm.amdgcn.s.waitcnt(i32 0)
br label %end
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll
index 72513fc86f49..8f64c50b9c60 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll
@@ -20,7 +20,7 @@ declare void @llvm.amdgcn.s.decperflevel(i32) #0
; GCN: s_decperflevel 13{{$}}
; GCN: s_decperflevel 14{{$}}
; GCN: s_decperflevel 15{{$}}
-define void @test_s_decperflevel(i32 %x) #0 {
+define amdgpu_kernel void @test_s_decperflevel(i32 %x) #0 {
call void @llvm.amdgcn.s.decperflevel(i32 0)
call void @llvm.amdgcn.s.decperflevel(i32 1)
call void @llvm.amdgcn.s.decperflevel(i32 2)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
index 4304398182a6..906a8a3e05f4 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
@@ -4,7 +4,7 @@
; GCN-LABEL: {{^}}s_getreg_test:
; GCN: s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_LDS_ALLOC, 8, 23)
-define void @s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size.
+define amdgpu_kernel void @s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size.
%lds_size_64dwords = call i32 @llvm.amdgcn.s.getreg(i32 45574)
%lds_size_bytes = shl i32 %lds_size_64dwords, 8
store i32 %lds_size_bytes, i32 addrspace(1)* %out
@@ -14,7 +14,7 @@ define void @s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size
; Call site has additional readnone knowledge.
; GCN-LABEL: {{^}}readnone_s_getreg_test:
; GCN: s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_LDS_ALLOC, 8, 23)
-define void @readnone_s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size.
+define amdgpu_kernel void @readnone_s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size.
%lds_size_64dwords = call i32 @llvm.amdgcn.s.getreg(i32 45574) #1
%lds_size_bytes = shl i32 %lds_size_64dwords, 8
store i32 %lds_size_bytes, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll
index 2ae4fc473eaa..49e6e4257906 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll
@@ -20,7 +20,7 @@ declare void @llvm.amdgcn.s.incperflevel(i32) #0
; GCN: s_incperflevel 13{{$}}
; GCN: s_incperflevel 14{{$}}
; GCN: s_incperflevel 15{{$}}
-define void @test_s_incperflevel(i32 %x) #0 {
+define amdgpu_kernel void @test_s_incperflevel(i32 %x) #0 {
call void @llvm.amdgcn.s.incperflevel(i32 0)
call void @llvm.amdgcn.s.incperflevel(i32 1)
call void @llvm.amdgcn.s.incperflevel(i32 2)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
index d8eda10fdfd8..66041037168a 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
@@ -10,7 +10,7 @@ declare i64 @llvm.amdgcn.s.memrealtime() #0
; GCN-NOT: lgkmcnt
; GCN: s_memrealtime s{{\[[0-9]+:[0-9]+\]}}
; GCN: _store_dwordx2
-define void @test_s_memrealtime(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_s_memrealtime(i64 addrspace(1)* %out) #0 {
%cycle0 = call i64 @llvm.amdgcn.s.memrealtime()
store volatile i64 %cycle0, i64 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
index ff9d74619788..6aef769bafad 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
@@ -11,7 +11,7 @@ declare i64 @llvm.amdgcn.s.memtime() #0
; GCN-NOT: lgkmcnt
; GCN: s_memtime s{{\[[0-9]+:[0-9]+\]}}
; GCN: buffer_store_dwordx2
-define void @test_s_memtime(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_s_memtime(i64 addrspace(1)* %out) #0 {
%cycle0 = call i64 @llvm.amdgcn.s.memtime()
store volatile i64 %cycle0, i64 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll
index 870aa48a3417..59c910c71c5a 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll
@@ -20,7 +20,7 @@ declare void @llvm.amdgcn.s.sleep(i32) #0
; GCN: s_sleep 13{{$}}
; GCN: s_sleep 14{{$}}
; GCN: s_sleep 15{{$}}
-define void @test_s_sleep(i32 %x) #0 {
+define amdgpu_kernel void @test_s_sleep(i32 %x) #0 {
call void @llvm.amdgcn.s.sleep(i32 0)
call void @llvm.amdgcn.s.sleep(i32 1)
call void @llvm.amdgcn.s.sleep(i32 2)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll
index 3aaed9d53772..2a3705de2b44 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll
@@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.sad.hi.u8(i32, i32, i32) #0
; GCN-LABEL: {{^}}v_sad_hi_u8:
; GCN: v_sad_hi_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_hi_u8(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_sad_hi_u8(i32 addrspace(1)* %out, i32 %src) {
%result = call i32 @llvm.amdgcn.sad.hi.u8(i32 %src, i32 100, i32 100) #0
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
@@ -13,7 +13,7 @@ define void @v_sad_hi_u8(i32 addrspace(1)* %out, i32 %src) {
; GCN-LABEL: {{^}}v_sad_hi_u8_non_immediate:
; GCN: v_sad_hi_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_hi_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
+define amdgpu_kernel void @v_sad_hi_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
%result = call i32 @llvm.amdgcn.sad.hi.u8(i32 %src, i32 %a, i32 %b) #0
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll
index 5438571c5821..c404531513e7 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll
@@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.sad.u16(i32, i32, i32) #0
; GCN-LABEL: {{^}}v_sad_u16:
; GCN: v_sad_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u16(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_sad_u16(i32 addrspace(1)* %out, i32 %src) {
%result = call i32 @llvm.amdgcn.sad.u16(i32 %src, i32 100, i32 100) #0
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
@@ -13,7 +13,7 @@ define void @v_sad_u16(i32 addrspace(1)* %out, i32 %src) {
; GCN-LABEL: {{^}}v_sad_u16_non_immediate:
; GCN: v_sad_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u16_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
+define amdgpu_kernel void @v_sad_u16_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
%result = call i32 @llvm.amdgcn.sad.u16(i32 %src, i32 %a, i32 %b) #0
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll
index 9422d7620ca6..1ee876aa724e 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll
@@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.sad.u8(i32, i32, i32) #0
; GCN-LABEL: {{^}}v_sad_u8:
; GCN: v_sad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u8(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_sad_u8(i32 addrspace(1)* %out, i32 %src) {
%result = call i32 @llvm.amdgcn.sad.u8(i32 %src, i32 100, i32 100) #0
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
@@ -13,7 +13,7 @@ define void @v_sad_u8(i32 addrspace(1)* %out, i32 %src) {
; GCN-LABEL: {{^}}v_sad_u8_non_immediate:
; GCN: v_sad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
+define amdgpu_kernel void @v_sad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
%result = call i32 @llvm.amdgcn.sad.u8(i32 %src, i32 %a, i32 %b) #0
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
new file mode 100644
index 000000000000..593c95856811
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
@@ -0,0 +1,556 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}bfe_i32_arg_arg_arg:
+; GCN: v_bfe_i32
+define amdgpu_kernel void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 %src1)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
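+
+; The checks in this file assume the usual BFE semantics for v_bfe_i32:
+; extract the `width`-bit field of the source starting at bit `offset`,
+; then sign-extend it to 32 bits. A minimal model (a sketch, not part of
+; the test):
+;   field  = (src >> offset) & ((1 << width) - 1)
+;   result = sign-extend field from `width` bits to 32 bits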
+
+; GCN-LABEL: {{^}}bfe_i32_arg_arg_imm:
+; GCN: v_bfe_i32
+define amdgpu_kernel void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 123)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_arg_imm_arg:
+; GCN: v_bfe_i32
+define amdgpu_kernel void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 123, i32 %src2)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_imm_arg_arg:
+; GCN: v_bfe_i32
+define amdgpu_kernel void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 123, i32 %src1, i32 %src2)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_bfe_print_arg:
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8
+define amdgpu_kernel void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) #0 {
+ %load = load i32, i32 addrspace(1)* %src0, align 4
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 2, i32 8)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_arg_0_width_reg_offset:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 0)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_arg_0_width_imm_offset:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 8, i32 0)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
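+
+; For both zero-width cases above, the extracted field is empty and folds
+; to 0 (see the constant-fold tests below), so no bfe should be emitted.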
+
+; GCN-LABEL: {{^}}bfe_i32_test_6:
+; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 1, i32 31)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_test_7:
+; GCN-NOT: shl
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 0, i32 31)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_test_8:
+; GCN: buffer_load_dword
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_test_9:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_test_10:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 1, i32 31)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_test_11:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 8, i32 24)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_test_12:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 24, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
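+
+; In tests 9 through 12 the field reaches bit 31 (offset + width == 32),
+; so the sign-extending extract is equivalent to a plain arithmetic shift
+; right by `offset`, and a v_ashrrev_i32 is expected instead of a bfe.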
+
+; GCN-LABEL: {{^}}bfe_i32_test_13:
+; GCN: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = ashr i32 %x, 31
+ %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_test_14:
+; GCN-NOT: lshr
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = lshr i32 %x, 31
+ %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_0:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 0, i32 0, i32 0)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_1:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 12334, i32 0, i32 0)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_2:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 0, i32 0, i32 1)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_3:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 1, i32 0, i32 1)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_4:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 4294967295, i32 0, i32 1)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_5:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 128, i32 7, i32 1)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_6:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0xffffff80
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 128, i32 0, i32 8)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_7:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 127, i32 0, i32 8)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_8:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 127, i32 6, i32 8)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_9:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 65536, i32 16, i32 8)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_10:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 65535, i32 16, i32 16)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_11:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -6
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 4, i32 4)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
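+
+; Worked example for the fold above: 160 = 0b1010_0000, so sbfe(160, 4, 4)
+; selects bits [7:4] = 0b1010; sign-extending 0b1010 from 4 bits gives -6,
+; matching the expected register value.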
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_12:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 31, i32 1)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_13:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 131070, i32 16, i32 16)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_14:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 40
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 2, i32 30)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_15:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 4, i32 28)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_16:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 4294967295, i32 1, i32 7)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_17:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 255, i32 1, i32 31)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_18:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) #0 {
+ %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 255, i32 31, i32 1)
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_sext_in_reg_i24:
+; GCN: buffer_load_dword [[LOAD:v[0-9]+]],
+; GCN-NOT: v_lshl
+; GCN-NOT: v_ashr
+; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24
+; GCN: buffer_store_dword [[BFE]],
+define amdgpu_kernel void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 0, i32 24)
+ %shl = shl i32 %bfe, 8
+ %ashr = ashr i32 %shl, 8
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
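+
+; The shl-8/ashr-8 pair above is a sign_extend_inreg from 24 bits; since
+; the bfe already sign-extends its 24-bit field, the pair is redundant and
+; should fold away, leaving the single v_bfe_i32 checked above.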
+
+; GCN-LABEL: {{^}}simplify_demanded_bfe_sdiv:
+; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
+; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 1, 16
+; GCN: v_lshrrev_b32_e32 [[TMP0:v[0-9]+]], 31, [[BFE]]
+; GCN: v_add_i32_e32 [[TMP1:v[0-9]+]], vcc, [[TMP0]], [[BFE]]
+; GCN: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]]
+; GCN: buffer_store_dword [[TMP2]]
+define amdgpu_kernel void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %src = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 1, i32 16)
+ %div = sdiv i32 %bfe, 2
+ store i32 %div, i32 addrspace(1)* %out, align 4
+ ret void
+}
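+
+; The check lines above follow the standard signed divide-by-2 expansion,
+; sdiv(x, 2) == (x + (x u>> 31)) s>> 1, which rounds toward zero for
+; negative x.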
+
+; GCN-LABEL: {{^}}bfe_0_width:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+ %load = load i32, i32 addrspace(1)* %ptr, align 4
+ %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 8, i32 0)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_8_bfe_8:
+; GCN: v_bfe_i32
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+ %load = load i32, i32 addrspace(1)* %ptr, align 4
+ %bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 8)
+ %bfe1 = call i32 @llvm.amdgcn.sbfe.i32(i32 %bfe0, i32 0, i32 8)
+ store i32 %bfe1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_8_bfe_16:
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+ %load = load i32, i32 addrspace(1)* %ptr, align 4
+ %bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 8)
+ %bfe1 = call i32 @llvm.amdgcn.sbfe.i32(i32 %bfe0, i32 0, i32 16)
+ store i32 %bfe1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; This really should be folded into one bfe.
+; GCN-LABEL: {{^}}bfe_16_bfe_8:
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+ %load = load i32, i32 addrspace(1)* %ptr, align 4
+ %bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 16)
+ %bfe1 = call i32 @llvm.amdgcn.sbfe.i32(i32 %bfe0, i32 0, i32 8)
+ store i32 %bfe1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; Make sure there isn't a redundant BFE
+; GCN-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe:
+; GCN: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %c = add i32 %a, %b ; add to prevent folding into extload
+ %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %c, i32 0, i32 8)
+ %shl = shl i32 %bfe, 24
+ %ashr = ashr i32 %shl, 24
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong:
+define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %c = add i32 %a, %b ; add to prevent folding into extload
+ %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %c, i32 8, i32 0)
+ %shl = shl i32 %bfe, 24
+ %ashr = ashr i32 %shl, 24
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
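+
+; Here the offset and width operands are deliberately swapped (offset 8,
+; width 0), so the sext_in_reg pattern must not be combined into a single
+; sign-extend; only the label is checked.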
+
+; GCN-LABEL: {{^}}sextload_i8_to_i32_bfe:
+; GCN: buffer_load_sbyte
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 {
+ %load = load i8, i8 addrspace(1)* %ptr, align 1
+ %sext = sext i8 %load to i32
+ %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %sext, i32 0, i32 8)
+ %shl = shl i32 %bfe, 24
+ %ashr = ashr i32 %shl, 24
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN: .text
+; GCN-LABEL: {{^}}sextload_i8_to_i32_bfe_0:{{.*$}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 {
+ %load = load i8, i8 addrspace(1)* %ptr, align 1
+ %sext = sext i8 %load to i32
+ %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %sext, i32 8, i32 0)
+ %shl = shl i32 %bfe, 24
+ %ashr = ashr i32 %shl, 24
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}sext_in_reg_i1_bfe_offset_0:
+; GCN-NOT: shr
+; GCN-NOT: shl
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
+; GCN: s_endpgm
+define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %shr = ashr i32 %shl, 31
+ %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shr, i32 0, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}sext_in_reg_i1_bfe_offset_1:
+; GCN: buffer_load_dword
+; GCN-NOT: shl
+; GCN-NOT: shr
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1
+; GCN: s_endpgm
+define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 30
+ %shr = ashr i32 %shl, 30
+ %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shr, i32 1, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}sext_in_reg_i2_bfe_offset_1:
+; GCN: buffer_load_dword
+; GCN-NOT: v_lshl
+; GCN-NOT: v_ashr
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 2
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2
+; GCN: s_endpgm
+define amdgpu_kernel void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 30
+ %shr = ashr i32 %shl, 30
+ %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shr, i32 1, i32 2)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.sbfe.i32(i32, i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll
new file mode 100644
index 000000000000..94aeb077ebef
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll
@@ -0,0 +1,127 @@
+;RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}test_interrupt:
+; GCN: s_mov_b32 m0, 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: s_sendmsg sendmsg(MSG_INTERRUPT)
+define amdgpu_kernel void @test_interrupt() {
+body:
+ call void @llvm.amdgcn.s.sendmsg(i32 1, i32 0)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_gs_emit:
+; GCN: s_mov_b32 m0, 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0)
+define amdgpu_kernel void @test_gs_emit() {
+body:
+ call void @llvm.amdgcn.s.sendmsg(i32 34, i32 0)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_gs_cut:
+; GCN: s_mov_b32 m0, 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1)
+define amdgpu_kernel void @test_gs_cut() {
+body:
+ call void @llvm.amdgcn.s.sendmsg(i32 274, i32 0)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_gs_emit_cut:
+; GCN: s_mov_b32 m0, 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
+define amdgpu_kernel void @test_gs_emit_cut() {
+body:
+ call void @llvm.amdgcn.s.sendmsg(i32 562, i32 0)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_gs_done:
+; GCN: s_mov_b32 m0, 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+define amdgpu_kernel void @test_gs_done() {
+body:
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
+ ret void
+}
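+
+; The immediates above assume the SIMM16 layout of s_sendmsg: bits [3:0]
+; select the message (1 = INTERRUPT, 2 = GS, 3 = GS_DONE), bits [5:4] the
+; GS op (0 = NOP, 1 = CUT, 2 = EMIT, 3 = EMIT_CUT), and bits [9:8] the
+; stream id. For example: 34 = 2 | (2 << 4) -> GS_OP_EMIT on stream 0;
+; 274 = 2 | (1 << 4) | (1 << 8) -> GS_OP_CUT on stream 1;
+; 562 = 2 | (3 << 4) | (2 << 8) -> GS_OP_EMIT_CUT on stream 2.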
+
+; GCN-LABEL: {{^}}sendmsg:
+; GCN: s_mov_b32 m0, s0
+; VI-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GCN-NEXT: s_endpgm
+define amdgpu_gs void @sendmsg(i32 inreg %a) #0 {
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 %a)
+ ret void
+}
+
+; GCN-LABEL: {{^}}sendmsghalt:
+; GCN: s_mov_b32 m0, s0
+; VI-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsghalt sendmsg(MSG_INTERRUPT)
+; GCN-NEXT: s_endpgm
+define amdgpu_gs void @sendmsghalt(i32 inreg %a) #0 {
+ call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 %a)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_interrupt_halt:
+; GCN: s_mov_b32 m0, 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: s_sendmsghalt sendmsg(MSG_INTERRUPT)
+define amdgpu_kernel void @test_interrupt_halt() {
+body:
+ call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 0)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_gs_emit_halt:
+; GCN: s_mov_b32 m0, 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT, 0)
+define amdgpu_kernel void @test_gs_emit_halt() {
+body:
+ call void @llvm.amdgcn.s.sendmsghalt(i32 34, i32 0)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_gs_cut_halt:
+; GCN: s_mov_b32 m0, 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: s_sendmsghalt sendmsg(MSG_GS, GS_OP_CUT, 1)
+define amdgpu_kernel void @test_gs_cut_halt() {
+body:
+ call void @llvm.amdgcn.s.sendmsghalt(i32 274, i32 0)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_gs_emit_cut_halt:
+; GCN: s_mov_b32 m0, 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
+define amdgpu_kernel void @test_gs_emit_cut_halt() {
+body:
+ call void @llvm.amdgcn.s.sendmsghalt(i32 562, i32 0)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_gs_done_halt:
+; GCN: s_mov_b32 m0, 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: s_sendmsghalt sendmsg(MSG_GS_DONE, GS_OP_NOP)
+define amdgpu_kernel void @test_gs_done_halt() {
+body:
+ call void @llvm.amdgcn.s.sendmsghalt(i32 3, i32 0)
+ ret void
+}
+
+declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0
+declare void @llvm.amdgcn.s.sendmsghalt(i32, i32) #0
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
index d453d03cded8..495e36b09f8f 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
@@ -2,14 +2,13 @@
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
declare i32 @llvm.amdgcn.sffbh.i32(i32) #1
-declare i32 @llvm.AMDGPU.flbit.i32(i32) #1
; FUNC-LABEL: {{^}}s_flbit:
; GCN: s_load_dword [[VAL:s[0-9]+]],
; GCN: s_flbit_i32 [[SRESULT:s[0-9]+]], [[VAL]]
; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
; GCN: buffer_store_dword [[VRESULT]],
-define void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) #0 {
+define amdgpu_kernel void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) #0 {
%r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val)
store i32 %r, i32 addrspace(1)* %out, align 4
ret void
@@ -19,36 +18,12 @@ define void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) #0 {
; GCN: buffer_load_dword [[VAL:v[0-9]+]],
; GCN: v_ffbh_i32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
-define void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
%val = load i32, i32 addrspace(1)* %valptr, align 4
%r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val)
store i32 %r, i32 addrspace(1)* %out, align 4
ret void
}
-; FUNC-LABEL: {{^}}legacy_s_flbit:
-; GCN: s_load_dword [[VAL:s[0-9]+]],
-; GCN: s_flbit_i32 [[SRESULT:s[0-9]+]], [[VAL]]
-; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; GCN: buffer_store_dword [[VRESULT]],
-; GCN: s_endpgm
-define void @legacy_s_flbit(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
- %r = call i32 @llvm.AMDGPU.flbit.i32(i32 %val) nounwind readnone
- store i32 %r, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}legacy_v_flbit:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
-; GCN: v_ffbh_i32_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[RESULT]],
-; GCN: s_endpgm
-define void @legacy_v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr, align 4
- %r = call i32 @llvm.AMDGPU.flbit.i32(i32 %val) nounwind readnone
- store i32 %r, i32 addrspace(1)* %out, align 4
- ret void
-}
-
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll
index fac0e352614c..4b930bfa210c 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll
@@ -7,7 +7,7 @@ declare half @llvm.amdgcn.sin.f16(half %a)
; VI: v_sin_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @sin_f16(
+define amdgpu_kernel void @sin_f16(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
index e3692fc5906c..0b7064da23f9 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
@@ -5,7 +5,7 @@ declare float @llvm.amdgcn.sin.f32(float) #0
; GCN-LABEL: {{^}}v_sin_f32:
; GCN: v_sin_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @v_sin_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @v_sin_f32(float addrspace(1)* %out, float %src) #1 {
%sin = call float @llvm.amdgcn.sin.f32(float %src) #0
store float %sin, float addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
index caac6ddbeb80..e0cec2134e70 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
@@ -9,7 +9,7 @@ declare double @llvm.amdgcn.trig.preop.f64(double, i32) nounwind readnone
; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]]
; SI: buffer_store_dwordx2 [[RESULT]],
; SI: s_endpgm
-define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
%a = load double, double addrspace(1)* %aptr, align 8
%b = load i32, i32 addrspace(1)* %bptr, align 4
%result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 %b) nounwind readnone
@@ -22,7 +22,7 @@ define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)*
; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7
; SI: buffer_store_dwordx2 [[RESULT]],
; SI: s_endpgm
-define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
+define amdgpu_kernel void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
%a = load double, double addrspace(1)* %aptr, align 8
%result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 7) nounwind readnone
store double %result, double addrspace(1)* %out, align 8
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
new file mode 100644
index 000000000000..92e3a1099da0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
@@ -0,0 +1,623 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}bfe_u32_arg_arg_arg:
+; GCN: v_bfe_u32
+define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src1)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_arg_arg_imm:
+; GCN: v_bfe_u32
+define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 123)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_arg_imm_arg:
+; GCN: v_bfe_u32
+define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 123, i32 %src2)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_imm_arg_arg:
+; GCN: v_bfe_u32
+define amdgpu_kernel void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 123, i32 %src1, i32 %src2)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
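+
+; As with the sbfe tests, these assume the usual unsigned BFE model:
+; v_bfe_u32 returns (src >> offset) & ((1 << width) - 1), the same field
+; extraction as v_bfe_i32 but zero-extended instead of sign-extended.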
+
+; GCN-LABEL: {{^}}bfe_u32_arg_0_width_reg_offset:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 0)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_arg_0_width_imm_offset:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 8, i32 0)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_zextload_i8:
+; GCN: buffer_load_ubyte
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+ %load = load i8, i8 addrspace(1)* %in
+ %ext = zext i8 %load to i32
+ %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i8:
+; GCN: buffer_load_dword
+; GCN: v_add_i32
+; GCN-NEXT: v_and_b32_e32
+; FIXME: Should be using s_add_i32
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %load = load i32, i32 addrspace(1)* %in, align 4
+ %add = add i32 %load, 1
+ %ext = and i32 %add, 255
+ %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
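+
+; The bfe above folds away because `and i32 %add, 255` already zero-extends
+; in-register: extracting 8 bits at offset 0 from a value known to fit in
+; 8 bits is an identity.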
+
+; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i16:
+; GCN: buffer_load_dword
+; GCN: v_add_i32
+; GCN-NEXT: v_and_b32_e32
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %load = load i32, i32 addrspace(1)* %in, align 4
+ %add = add i32 %load, 1
+ %ext = and i32 %add, 65535
+ %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 16)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_1:
+; GCN: buffer_load_dword
+; GCN: v_add_i32
+; GCN: bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %load = load i32, i32 addrspace(1)* %in, align 4
+ %add = add i32 %load, 1
+ %ext = and i32 %add, 255
+ %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 1, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_3:
+; GCN: buffer_load_dword
+; GCN: v_add_i32
+; GCN-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8
+; GCN-NEXT: bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %load = load i32, i32 addrspace(1)* %in, align 4
+ %add = add i32 %load, 1
+ %ext = and i32 %add, 255
+ %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 3, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_7:
+; GCN: buffer_load_dword
+; GCN: v_add_i32
+; GCN-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80
+; GCN-NEXT: bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %load = load i32, i32 addrspace(1)* %in, align 4
+ %add = add i32 %load, 1
+ %ext = and i32 %add, 255
+ %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 7, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i16_offset_8:
+; GCN: buffer_load_dword
+; GCN: v_add_i32
+; GCN-NEXT: bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %load = load i32, i32 addrspace(1)* %in, align 4
+ %add = add i32 %load, 1
+ %ext = and i32 %add, 65535
+ %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 8, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_1:
+; GCN: buffer_load_dword
+; GCN: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 0, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_2:
+define amdgpu_kernel void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_3:
+define amdgpu_kernel void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_4:
+; GCN-NOT: lshl
+; GCN-NOT: shr
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %shr = lshr i32 %shl, 31
+ %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shr, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_5:
+; GCN: buffer_load_dword
+; GCN-NOT: lshl
+; GCN-NOT: shr
+; GCN: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %shr = ashr i32 %shl, 31
+ %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shr, i32 0, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_6:
+; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 1, i32 31)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_7:
+; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 31)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_8:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_9:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_10:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 1, i32 31)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_11:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 8, i32 24)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_12:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 24, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_13:
+; GCN: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = ashr i32 %x, 31
+ %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_14:
+; GCN-NOT: lshr
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = lshr i32 %x, 31
+ %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_0:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 0)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_1:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 12334, i32 0, i32 0)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_2:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 1)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_3:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 1, i32 0, i32 1)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_4:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 0, i32 1)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_5:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 7, i32 1)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_6:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x80
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 0, i32 8)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_7:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 0, i32 8)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_8:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 6, i32 8)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_9:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65536, i32 16, i32 8)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_10:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65535, i32 16, i32 16)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_11:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 4)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_12:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 31, i32 1)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_13:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 131070, i32 16, i32 16)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_14:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 40
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 2, i32 30)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_15:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 28)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_16:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 1, i32 7)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_17:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 1, i32 31)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_18:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) #0 {
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 31, i32 1)
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; Make sure that SimplifyDemandedBits doesn't cause the and to be
+; reduced to the bits demanded by the bfe.
+
+; XXX: The operand to v_bfe_u32 could also just directly be the load register.
+; GCN-LABEL: {{^}}simplify_bfe_u32_multi_use_arg:
+; GCN: buffer_load_dword [[ARG:v[0-9]+]]
+; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 63, [[ARG]]
+; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[AND]], 2, 2
+; GCN-DAG: buffer_store_dword [[AND]]
+; GCN-DAG: buffer_store_dword [[BFE]]
+; GCN: s_endpgm
+define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
+ i32 addrspace(1)* %out1,
+ i32 addrspace(1)* %in) #0 {
+ %src = load i32, i32 addrspace(1)* %in, align 4
+ %and = and i32 %src, 63
+ %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %and, i32 2, i32 2)
+ store i32 %bfe_u32, i32 addrspace(1)* %out0, align 4
+ store i32 %and, i32 addrspace(1)* %out1, align 4
+ ret void
+}
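+
+; For contrast, a sketch of the single-use case (illustrative only, not a
+; check in this file): without the second store keeping %and alive,
+; SimplifyDemandedBits may shrink the mask to the two bits the bfe reads
+; and the and folds away, leaving the bfe to operate on the load directly:
+;   %src = load i32, i32 addrspace(1)* %in, align 4
+;   %and = and i32 %src, 63                                  ; single use
+;   %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %and, i32 2, i32 2)
+;   store i32 %bfe_u32, i32 addrspace(1)* %out0, align 4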
+
+; GCN-LABEL: {{^}}lshr_and:
+; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
+; GCN: buffer_store_dword
+define amdgpu_kernel void @lshr_and(i32 addrspace(1)* %out, i32 %a) #0 {
+ %b = lshr i32 %a, 6
+ %c = and i32 %b, 7
+ store i32 %c, i32 addrspace(1)* %out, align 8
+ ret void
+}
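+
+; Note: the s_bfe_u32 field-descriptor immediate packs (width << 16) | offset,
+; so 0x30006 above selects width 3 at offset 6, i.e. exactly (%a >> 6) & 7.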
+
+; GCN-LABEL: {{^}}v_lshr_and:
+; GCN: v_bfe_u32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, 3
+; GCN: buffer_store_dword
+define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %c = lshr i32 %a, %b
+ %d = and i32 %c, 7
+ store i32 %d, i32 addrspace(1)* %out, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}and_lshr:
+; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
+; GCN: buffer_store_dword
+define amdgpu_kernel void @and_lshr(i32 addrspace(1)* %out, i32 %a) #0 {
+ %b = and i32 %a, 448
+ %c = lshr i32 %b, 6
+ store i32 %c, i32 addrspace(1)* %out, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}and_lshr2:
+; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
+; GCN: buffer_store_dword
+define amdgpu_kernel void @and_lshr2(i32 addrspace(1)* %out, i32 %a) #0 {
+ %b = and i32 %a, 511
+ %c = lshr i32 %b, 6
+ store i32 %c, i32 addrspace(1)* %out, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}shl_lshr:
+; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x150002
+; GCN: buffer_store_dword
+define amdgpu_kernel void @shl_lshr(i32 addrspace(1)* %out, i32 %a) #0 {
+ %b = shl i32 %a, 9
+ %c = lshr i32 %b, 11
+ store i32 %c, i32 addrspace(1)* %out, align 8
+ ret void
+}
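+
+; Here 0x150002 encodes width 0x15 (21) at offset 2: (%a << 9) >> 11 keeps
+; the 21 bits of %a starting at bit 2.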
+
+declare i32 @llvm.amdgcn.ubfe.i32(i32, i32, i32) #1
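+
+; (llvm.amdgcn.ubfe follows the hardware BFE rule (src >> offset) & (2^width - 1),
+; with width 0 producing 0, as the constant-fold tests above rely on.)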
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll
index e85179755371..e305f8eff587 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll
@@ -4,7 +4,7 @@
; GCN-DAG: ; wave barrier
; GCN-NOT: s_barrier
-define void @test_wave_barrier() #0 {
+define amdgpu_kernel void @test_wave_barrier() #0 {
entry:
call void @llvm.amdgcn.wave.barrier() #1
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
index 58529b874442..349e7f0f0e8d 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
@@ -34,7 +34,7 @@ declare i32 @llvm.amdgcn.workgroup.id.z() #0
; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @test_workgroup_id_x(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workgroup_id_x(i32 addrspace(1)* %out) #1 {
%id = call i32 @llvm.amdgcn.workgroup.id.x()
store i32 %id, i32 addrspace(1)* %out
ret void
@@ -61,7 +61,7 @@ define void @test_workgroup_id_x(i32 addrspace(1)* %out) #1 {
; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @test_workgroup_id_y(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workgroup_id_y(i32 addrspace(1)* %out) #1 {
%id = call i32 @llvm.amdgcn.workgroup.id.y()
store i32 %id, i32 addrspace(1)* %out
ret void
@@ -96,7 +96,7 @@ define void @test_workgroup_id_y(i32 addrspace(1)* %out) #1 {
; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @test_workgroup_id_z(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workgroup_id_z(i32 addrspace(1)* %out) #1 {
%id = call i32 @llvm.amdgcn.workgroup.id.z()
store i32 %id, i32 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
index 1f18173f40a4..8b80998cab6f 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
@@ -18,7 +18,7 @@ declare i32 @llvm.amdgcn.workitem.id.z() #0
; ALL-NOT: v0
; ALL: {{buffer|flat}}_store_dword {{.*}}v0
-define void @test_workitem_id_x(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workitem_id_x(i32 addrspace(1)* %out) #1 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
store i32 %id, i32 addrspace(1)* %out
ret void
@@ -33,7 +33,7 @@ define void @test_workitem_id_x(i32 addrspace(1)* %out) #1 {
; ALL-NOT: v1
; ALL: {{buffer|flat}}_store_dword {{.*}}v1
-define void @test_workitem_id_y(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workitem_id_y(i32 addrspace(1)* %out) #1 {
%id = call i32 @llvm.amdgcn.workitem.id.y()
store i32 %id, i32 addrspace(1)* %out
ret void
@@ -48,7 +48,7 @@ define void @test_workitem_id_y(i32 addrspace(1)* %out) #1 {
; ALL-NOT: v2
; ALL: {{buffer|flat}}_store_dword {{.*}}v2
-define void @test_workitem_id_z(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workitem_id_z(i32 addrspace(1)* %out) #1 {
%id = call i32 @llvm.amdgcn.workitem.id.z()
store i32 %id, i32 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
index 112e29ed22a7..0604a49372a2 100644
--- a/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
@@ -4,7 +4,7 @@
declare half @llvm.ceil.f16(half %a)
declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a)
-; GCN-LABEL: {{^}}ceil_f16
+; GCN-LABEL: {{^}}ceil_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_ceil_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]]
@@ -12,7 +12,7 @@ declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a)
; VI: v_ceil_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @ceil_f16(
+define amdgpu_kernel void @ceil_f16(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -22,23 +22,27 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}ceil_v2f16
+; GCN-LABEL: {{^}}ceil_v2f16:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_ceil_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_ceil_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_ceil_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; VI: v_ceil_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: and
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_ceil_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_ceil_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NOT: and
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
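+
+; (The VI _sdwa form above writes its result directly into the high half of
+; the destination via dst_sel:WORD_1, so no v_and/v_lshlrev repacking pair
+; is needed and a single v_or_b32 merges the two halves.)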
-define void @ceil_v2f16(
+define amdgpu_kernel void @ceil_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index ba354ed0b124..d836ea36ef63 100644
--- a/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -13,7 +13,7 @@ declare <2 x half> @llvm.cos.v2f16(<2 x half> %a)
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @cos_f16(
+define amdgpu_kernel void @cos_f16(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -25,26 +25,34 @@ entry:
; GCN-LABEL: {{^}}cos_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI: v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}}
-; GCN: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]]
-; VI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]]
-; GCN: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
-; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]]
-; VI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]]
-; GCN: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
-; GCN: v_cos_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
-; GCN: v_cos_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
-; GCN: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; GCN: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}}
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]]
+; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]]
+
+; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]]
+; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]]
+
+; GCN-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
+; GCN-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
+; GCN-DAG: v_cos_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
+; GCN-DAG: v_cos_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
+
+; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GCN-NOT: and
+
+; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
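+
+; (v_cos_f32 takes its input in fractions of a full turn, so the lowering
+; first scales by 1/(2*pi) -- 0x3e22f983 ~= 0.15915494 -- and wraps into
+; [0,1) with v_fract_f32.)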
-define void @cos_v2f16(
+define amdgpu_kernel void @cos_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/llvm.cos.ll b/test/CodeGen/AMDGPU/llvm.cos.ll
index eb7dcbbf2346..bd89502d7b82 100644
--- a/test/CodeGen/AMDGPU/llvm.cos.ll
+++ b/test/CodeGen/AMDGPU/llvm.cos.ll
@@ -11,7 +11,7 @@
;SI: v_cos_f32
;SI-NOT: v_cos_f32
-define void @test(float addrspace(1)* %out, float %x) #1 {
+define amdgpu_kernel void @test(float addrspace(1)* %out, float %x) #1 {
%cos = call float @llvm.cos.f32(float %x)
store float %cos, float addrspace(1)* %out
ret void
@@ -29,7 +29,7 @@ define void @test(float addrspace(1)* %out, float %x) #1 {
;SI: v_cos_f32
;SI-NOT: v_cos_f32
-define void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 {
+define amdgpu_kernel void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 {
%cos = call <4 x float> @llvm.cos.v4f32(<4 x float> %vx)
store <4 x float> %cos, <4 x float> addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/test/CodeGen/AMDGPU/llvm.dbg.value.ll
index 8b0854c2c2b5..c4a76de5989c 100644
--- a/test/CodeGen/AMDGPU/llvm.dbg.value.ll
+++ b/test/CodeGen/AMDGPU/llvm.dbg.value.ll
@@ -9,7 +9,7 @@
; CHECK: buffer_store_dword
; CHECK: s_endpgm
-define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 {
+define amdgpu_kernel void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 {
entry:
tail call void @llvm.dbg.value(metadata i32 addrspace(1)* %globalptr_arg, i64 0, metadata !10, metadata !13), !dbg !14
store i32 123, i32 addrspace(1)* %globalptr_arg, align 4
diff --git a/test/CodeGen/AMDGPU/llvm.exp2.f16.ll b/test/CodeGen/AMDGPU/llvm.exp2.f16.ll
index 7fa56911efdc..5757142b9e95 100644
--- a/test/CodeGen/AMDGPU/llvm.exp2.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.exp2.f16.ll
@@ -12,7 +12,7 @@ declare <2 x half> @llvm.exp2.v2f16(<2 x half> %a)
; VI: v_exp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @exp2_f16(
+define amdgpu_kernel void @exp2_f16(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -25,20 +25,24 @@ entry:
; GCN-LABEL: {{^}}exp2_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_exp_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_exp_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_exp_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; VI: v_exp_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: and
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_exp_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_exp_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NOT: and
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @exp2_v2f16(
+define amdgpu_kernel void @exp2_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/llvm.exp2.ll b/test/CodeGen/AMDGPU/llvm.exp2.ll
index 42698925aae4..387dc3b8566a 100644
--- a/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -1,6 +1,6 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
-;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+;RUN: llc < %s -march=amdgcn | FileCheck %s --check-prefix=SI --check-prefix=FUNC
;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=SI --check-prefix=FUNC
;FUNC-LABEL: {{^}}test:
@@ -11,7 +11,7 @@
;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
;SI: v_exp_f32
-define void @test(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @test(float addrspace(1)* %out, float %in) {
entry:
%0 = call float @llvm.exp2.f32(float %in)
store float %0, float addrspace(1)* %out
@@ -34,7 +34,7 @@ entry:
;SI: v_exp_f32
;SI: v_exp_f32
-define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
entry:
%0 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in)
store <2 x float> %0, <2 x float> addrspace(1)* %out
@@ -68,7 +68,7 @@ entry:
;SI: v_exp_f32
;SI: v_exp_f32
;SI: v_exp_f32
-define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
entry:
%0 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in)
store <4 x float> %0, <4 x float> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/test/CodeGen/AMDGPU/llvm.floor.f16.ll
index 60dfd734ee73..6a18141d8035 100644
--- a/test/CodeGen/AMDGPU/llvm.floor.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.floor.f16.ll
@@ -12,7 +12,7 @@ declare <2 x half> @llvm.floor.v2f16(<2 x half> %a)
; VI: v_floor_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @floor_f16(
+define amdgpu_kernel void @floor_f16(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -25,20 +25,24 @@ entry:
; GCN-LABEL: {{^}}floor_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_floor_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_floor_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_floor_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; VI: v_floor_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: and
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_floor_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_floor_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NOT: and
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @floor_v2f16(
+define amdgpu_kernel void @floor_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/test/CodeGen/AMDGPU/llvm.fma.f16.ll
index 3431267e3943..518fe8baaa7a 100644
--- a/test/CodeGen/AMDGPU/llvm.fma.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.fma.f16.ll
@@ -16,7 +16,7 @@ declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @fma_f16(
+define amdgpu_kernel void @fma_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b,
@@ -32,7 +32,8 @@ define void @fma_f16(
; GCN-LABEL: {{^}}fma_f16_imm_a
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x4200{{$}}
+
+; SI: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}}
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
@@ -41,7 +42,7 @@ define void @fma_f16(
; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
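+
+; (0x4200 is 3.0 as an f16 bit pattern and 0x40400000 is 3.0 as f32; the
+; immediate is now widened to f32 at compile time instead of through a
+; runtime v_cvt_f32_f16 on SI.)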
-define void @fma_f16_imm_a(
+define amdgpu_kernel void @fma_f16_imm_a(
half addrspace(1)* %r,
half addrspace(1)* %b,
half addrspace(1)* %c) {
@@ -55,7 +56,7 @@ define void @fma_f16_imm_a(
; GCN-LABEL: {{^}}fma_f16_imm_b
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4200{{$}}
+; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
@@ -64,7 +65,7 @@ define void @fma_f16_imm_a(
; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @fma_f16_imm_b(
+define amdgpu_kernel void @fma_f16_imm_b(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %c) {
@@ -78,7 +79,7 @@ define void @fma_f16_imm_b(
; GCN-LABEL: {{^}}fma_f16_imm_c
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], 0x4200{{$}}
+; SI: v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
@@ -87,7 +88,7 @@ define void @fma_f16_imm_b(
; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @fma_f16_imm_c(
+define amdgpu_kernel void @fma_f16_imm_c(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -102,27 +103,35 @@ define void @fma_f16_imm_c(
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
+
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+
+; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
-; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
-; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+
+; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; VI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
+; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
+
+; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GCN-NOT: and
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @fma_v2f16(
+define amdgpu_kernel void @fma_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b,
@@ -135,29 +144,33 @@ define void @fma_v2f16(
ret void
}
-; GCN-LABEL: {{^}}fma_v2f16_imm_a
+; GCN-LABEL: {{^}}fma_v2f16_imm_a:
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x4200{{$}}
+
+; SI: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}}
; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}}
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
+; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; GCN-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32]], v[[C_F32_0]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_F16]], v[[C_V2_F16]]
-; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16]], v[[C_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+
+; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], v[[A_F16]], v[[B_F16_1]]
+; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], v[[A_F16]], v[[B_V2_F16]]
+
+; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GCN-NOT: and
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @fma_v2f16_imm_a(
+define amdgpu_kernel void @fma_v2f16_imm_a(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %b,
<2 x half> addrspace(1)* %c) {
@@ -168,29 +181,39 @@ define void @fma_v2f16_imm_a(
ret void
}
-; GCN-LABEL: {{^}}fma_v2f16_imm_b
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4200{{$}}
+; GCN-LABEL: {{^}}fma_v2f16_imm_b:
+; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+
+; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+
+; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}}
; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
-; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32]], v[[C_F32_0]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32]], v[[C_F32_1]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_F16]], v[[C_V2_F16]]
-; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16]], v[[C_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
+; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32]], v[[C_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32]], v[[C_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+
+; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_F16]], v[[C_V2_F16]]
+; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16]], v[[C_F16_1]]
+
+; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GCN-NOT: and
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @fma_v2f16_imm_b(
+define amdgpu_kernel void @fma_v2f16_imm_b(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %c) {
@@ -201,29 +224,39 @@ define void @fma_v2f16_imm_b(
ret void
}
-; GCN-LABEL: {{^}}fma_v2f16_imm_c
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], 0x4200{{$}}
+; GCN-LABEL: {{^}}fma_v2f16_imm_c:
+; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+
+; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+
+; SI: v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}}
; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]]
-; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+
+; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]]
+; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]]
+
+; GCN-NOT: and
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @fma_v2f16_imm_c(
+define amdgpu_kernel void @fma_v2f16_imm_c(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
diff --git a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index 3bc85bdc29ef..f30fd1d58204 100644
--- a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
-; RUN: llc -march=amdgcn -mattr=+fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
+; RUN: llc -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
+; RUN: llc -march=amdgcn -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
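+
+; (fp64 and fp16 denormal flushing share a single MODE-register field on
+; these targets, hence the combined fp64-fp16-denormals feature name.)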
declare half @llvm.fmuladd.f16(half %a, half %b, half %c)
declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
@@ -24,7 +24,7 @@ declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half>
; VI-DENORM: buffer_store_short [[RESULT]]
; GCN: s_endpgm
-define void @fmuladd_f16(
+define amdgpu_kernel void @fmuladd_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b,
@@ -40,10 +40,9 @@ define void @fmuladd_f16(
; GCN-LABEL: {{^}}fmuladd_f16_imm_a
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x4200{{$}}
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
-; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]]
+; SI: v_mac_f32_e32 v[[C_F32]], 0x40400000, v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
; SI: buffer_store_short v[[R_F16]]
@@ -55,7 +54,7 @@ define void @fmuladd_f16(
; VI-DENORM: buffer_store_short [[RESULT]]
; GCN: s_endpgm
-define void @fmuladd_f16_imm_a(
+define amdgpu_kernel void @fmuladd_f16_imm_a(
half addrspace(1)* %r,
half addrspace(1)* %b,
half addrspace(1)* %c) {
@@ -69,10 +68,9 @@ define void @fmuladd_f16_imm_a(
; GCN-LABEL: {{^}}fmuladd_f16_imm_b
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4200{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
-; SI: v_mac_f32_e32 v[[C_F32]], v[[B_F32]], v[[A_F32]]
+; SI: v_mac_f32_e32 v[[C_F32]], 0x40400000, v[[A_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
; SI: buffer_store_short v[[R_F16]]
@@ -85,7 +83,7 @@ define void @fmuladd_f16_imm_a(
; GCN: s_endpgm
-define void @fmuladd_f16_imm_b(
+define amdgpu_kernel void @fmuladd_f16_imm_b(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %c) {
@@ -100,38 +98,45 @@ define void @fmuladd_f16_imm_b(
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
+; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI: v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[C_F32_0]]
; SI: v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
-; SI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-
-
-; FIXME: and should be unnecessary
-; VI-FLUSH: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]]
-; VI-FLUSH: v_mac_f16_e32 v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]]
-; VI-FLUSH: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[C_V2_F16]]
-; VI-FLUSH: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
-
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+
+; VI-FLUSH: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; VI-FLUSH-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[C_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[C_V2_F16]], v[[B_V2_F16]]
+; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]]
+; VI-FLUSH-NOT: v_and_b32
+; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[A_V2_F16]]
+
+; VI-DENORM: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; VI-DENORM: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; VI-DENORM: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
-; VI-DENORM: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[RES0]]
-; VI-DENORM: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]]
+; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]]
+; VI-DENORM-NOT: v_and_b32
+; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[RES0]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @fmuladd_v2f16(
+
+define amdgpu_kernel void @fmuladd_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b,
diff --git a/test/CodeGen/AMDGPU/llvm.log2.f16.ll b/test/CodeGen/AMDGPU/llvm.log2.f16.ll
index 8d1a8973cb4e..773eb55283e4 100644
--- a/test/CodeGen/AMDGPU/llvm.log2.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.log2.f16.ll
@@ -12,7 +12,7 @@ declare <2 x half> @llvm.log2.v2f16(<2 x half> %a)
; VI: v_log_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @log2_f16(
+define amdgpu_kernel void @log2_f16(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -24,21 +24,25 @@ entry:
; GCN-LABEL: {{^}}log2_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
-; SI: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; VI: v_log_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
+; SI-DAG: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: and
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NOT: and
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @log2_v2f16(
+define amdgpu_kernel void @log2_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/llvm.log2.ll b/test/CodeGen/AMDGPU/llvm.log2.ll
index c75e7850b353..b9d593e43f32 100644
--- a/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -11,7 +11,7 @@
;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
;SI: v_log_f32
-define void @test(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @test(float addrspace(1)* %out, float %in) {
entry:
%0 = call float @llvm.log2.f32(float %in)
store float %0, float addrspace(1)* %out
@@ -34,7 +34,7 @@ entry:
;SI: v_log_f32
;SI: v_log_f32
-define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
entry:
%0 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in)
store <2 x float> %0, <2 x float> addrspace(1)* %out
@@ -68,7 +68,7 @@ entry:
;SI: v_log_f32
;SI: v_log_f32
;SI: v_log_f32
-define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
entry:
%0 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in)
store <4 x float> %0, <4 x float> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 8adc01b7b8c7..4c8dff52509a 100644
--- a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -4,7 +4,7 @@
declare half @llvm.maxnum.f16(half %a, half %b)
declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
-; GCN-LABEL: {{^}}maxnum_f16
+; GCN-LABEL: {{^}}maxnum_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
@@ -14,7 +14,7 @@ declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @maxnum_f16(
+define amdgpu_kernel void @maxnum_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -26,16 +26,15 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}maxnum_f16_imm_a
+; GCN-LABEL: {{^}}maxnum_f16_imm_a:
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x4200{{$}}
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
+; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @maxnum_f16_imm_a(
+define amdgpu_kernel void @maxnum_f16_imm_a(
half addrspace(1)* %r,
half addrspace(1)* %b) {
entry:
@@ -45,16 +44,15 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}maxnum_f16_imm_b
+; GCN-LABEL: {{^}}maxnum_f16_imm_b:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4400{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
+; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
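+
+; (4.0 is one of the hardware's inline constants, so it needs no literal;
+; 3.0 is not, hence the 0x4200 / 0x40400000 literals in the imm_a test.)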
-define void @maxnum_f16_imm_b(
+define amdgpu_kernel void @maxnum_f16_imm_b(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -64,27 +62,33 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}maxnum_v2f16
+; GCN-LABEL: {{^}}maxnum_v2f16:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-; VI: v_max_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: and
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NOT: and
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @maxnum_v2f16(
+define amdgpu_kernel void @maxnum_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
@@ -96,25 +100,24 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}maxnum_v2f16_imm_a
+; GCN-LABEL: {{^}}maxnum_v2f16_imm_a:
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], 0x4200{{$}}
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], 0x4400{{$}}
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
+; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
+; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
-; VI: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; VI-DAG: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
+; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
+
+; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GCN-NOT: and
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @maxnum_v2f16_imm_a(
+define amdgpu_kernel void @maxnum_v2f16_imm_a(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %b) {
entry:
@@ -124,25 +127,24 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}maxnum_v2f16_imm_b
+; GCN-LABEL: {{^}}maxnum_v2f16_imm_b:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], 0x4400{{$}}
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], 0x4200{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
+; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
-; VI: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; VI-DAG: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
+; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
+
+; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GCN-NOT: and
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @maxnum_v2f16_imm_b(
+define amdgpu_kernel void @maxnum_v2f16_imm_b(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
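; (Editorial sketch, hypothetical and not part of this commit: each maxnum test
; above has the same elided body, roughly
;   %a.val = load half, half addrspace(1)* %a
;   %b.val = load half, half addrspace(1)* %b
;   %r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val)
;   store half %r.val, half addrspace(1)* %r
; SI has no 16-bit max, so the operation is promoted through v_cvt_f32_f16 and
; v_max_f32 and converted back, while VI selects v_max_f16 directly.)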
diff --git a/test/CodeGen/AMDGPU/llvm.memcpy.ll b/test/CodeGen/AMDGPU/llvm.memcpy.ll
index 009338d273f5..7b4db55155eb 100644
--- a/test/CodeGen/AMDGPU/llvm.memcpy.ll
+++ b/test/CodeGen/AMDGPU/llvm.memcpy.ll
@@ -80,7 +80,7 @@ declare void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* nocapture, i8 addrspace
; SI-DAG: ds_write_b8
; SI: s_endpgm
-define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
%bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
%bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 1, i1 false) nounwind
@@ -125,7 +125,7 @@ define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %
; SI-DAG: ds_write_b16
; SI: s_endpgm
-define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
%bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
%bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 2, i1 false) nounwind
@@ -144,7 +144,7 @@ define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %
; SI: ds_write2_b32
; SI: s_endpgm
-define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
%bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
%bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 4, i1 false) nounwind
@@ -161,7 +161,7 @@ define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %
; SI: ds_write2_b64
; SI-DAG: s_endpgm
-define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
%bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
%bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 8, i1 false) nounwind
@@ -238,7 +238,7 @@ define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %
; SI-DAG: buffer_store_byte
; SI: s_endpgm
-define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
%bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 1, i1 false) nounwind
@@ -281,7 +281,7 @@ define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noa
; SI-DAG: buffer_store_short
; SI: s_endpgm
-define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
%bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 2, i1 false) nounwind
@@ -294,7 +294,7 @@ define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noa
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: s_endpgm
-define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
%bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 4, i1 false) nounwind
@@ -307,7 +307,7 @@ define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noa
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: s_endpgm
-define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
%bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 8, i1 false) nounwind
@@ -320,7 +320,7 @@ define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noa
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: s_endpgm
-define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
%bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 16, i1 false) nounwind
@@ -340,7 +340,7 @@ define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* no
; SI-DAG: s_load_dwordx2
; SI-DAG: buffer_store_dwordx4
; SI-DAG: buffer_store_dwordx4
-define void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) nounwind {
+define amdgpu_kernel void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) nounwind {
%str = bitcast [16 x i8] addrspace(2)* @hello.align4 to i8 addrspace(2)*
call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* %out, i8 addrspace(2)* %str, i64 32, i32 4, i1 false)
ret void
@@ -365,7 +365,7 @@ define void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) noun
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
-define void @test_memcpy_const_string_align1(i8 addrspace(1)* noalias %out) nounwind {
+define amdgpu_kernel void @test_memcpy_const_string_align1(i8 addrspace(1)* noalias %out) nounwind {
%str = bitcast [16 x i8] addrspace(2)* @hello.align1 to i8 addrspace(2)*
call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* %out, i8 addrspace(2)* %str, i64 32, i32 1, i1 false)
ret void
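; (Editorial note, hypothetical call shape rather than part of this commit: the
; alignment operand of the old-style memcpy intrinsic drives the access width
; checked above; for the same 32-byte LDS copy,
;   call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 1, i1 false)
; forces per-byte ds_write_b8 stores, while i32 8 in the align position permits
; the paired 64-bit ds_write2_b64 form.)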
diff --git a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index 4cc1deb2095c..b8221356b664 100644
--- a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -4,7 +4,7 @@
declare half @llvm.minnum.f16(half %a, half %b)
declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
-; GCN-LABEL: {{^}}minnum_f16
+; GCN-LABEL: {{^}}minnum_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
@@ -14,7 +14,7 @@ declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @minnum_f16(
+define amdgpu_kernel void @minnum_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -26,16 +26,15 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}minnum_f16_imm_a
+; GCN-LABEL: {{^}}minnum_f16_imm_a:
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x4200{{$}}
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
+; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @minnum_f16_imm_a(
+define amdgpu_kernel void @minnum_f16_imm_a(
half addrspace(1)* %r,
half addrspace(1)* %b) {
entry:
@@ -45,16 +44,15 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}minnum_f16_imm_b
+; GCN-LABEL: {{^}}minnum_f16_imm_b:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4400{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
+; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @minnum_f16_imm_b(
+define amdgpu_kernel void @minnum_f16_imm_b(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -64,27 +62,32 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}minnum_v2f16
+; GCN-LABEL: {{^}}minnum_v2f16:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-; VI: v_min_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: and
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NOT: and
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @minnum_v2f16(
+define amdgpu_kernel void @minnum_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
@@ -96,25 +99,27 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}minnum_v2f16_imm_a
+; GCN-LABEL: {{^}}minnum_v2f16_imm_a:
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], 0x4200{{$}}
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], 0x4400{{$}}
+
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
-; VI: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
+; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+
+; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; VI-DAG: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
+; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
+
+; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GCN-NOT: and
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @minnum_v2f16_imm_a(
+define amdgpu_kernel void @minnum_v2f16_imm_a(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %b) {
entry:
@@ -124,25 +129,24 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}minnum_v2f16_imm_b
+; GCN-LABEL: {{^}}minnum_v2f16_imm_b:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], 0x4400{{$}}
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], 0x4200{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
+; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
-; VI: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; VI-DAG: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
+; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
+
+; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GCN-NOT: and
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @minnum_v2f16_imm_b(
+define amdgpu_kernel void @minnum_v2f16_imm_b(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll b/test/CodeGen/AMDGPU/llvm.r600.cube.ll
index 78b88122229b..b5a0de95acf5 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll
+++ b/test/CodeGen/AMDGPU/llvm.r600.cube.ll
@@ -22,7 +22,7 @@ main_body:
%tmp12 = insertelement <4 x float> %tmp11, float %tmp7, i32 1
%tmp13 = insertelement <4 x float> %tmp12, float %tmp10, i32 2
%tmp14 = insertelement <4 x float> %tmp13, float 1.000000e+00, i32 3
- %tmp15 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %tmp14)
+ %tmp15 = call <4 x float> @llvm.r600.cube(<4 x float> %tmp14)
%tmp16 = extractelement <4 x float> %tmp15, i32 0
%tmp17 = extractelement <4 x float> %tmp15, i32 1
%tmp18 = extractelement <4 x float> %tmp15, i32 2
@@ -44,7 +44,7 @@ main_body:
}
; Function Attrs: readnone
-declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #0
+declare <4 x float> @llvm.r600.cube(<4 x float>) #0
; Function Attrs: nounwind readnone
declare float @llvm.fabs.f32(float) #0
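; (Editorial note: this rename appears to be part of moving legacy llvm.AMDGPU.*
; intrinsics to target-qualified names; the call shape is unchanged, e.g.
;   %tmp15 = call <4 x float> @llvm.r600.cube(<4 x float> %tmp14)
; now resolves against the r600-prefixed declaration above.)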
diff --git a/test/CodeGen/AMDGPU/llvm.r600.dot4.ll b/test/CodeGen/AMDGPU/llvm.r600.dot4.ll
index 4db29c58385e..de8a47741c94 100644
--- a/test/CodeGen/AMDGPU/llvm.r600.dot4.ll
+++ b/test/CodeGen/AMDGPU/llvm.r600.dot4.ll
@@ -2,7 +2,7 @@
declare float @llvm.r600.dot4(<4 x float>, <4 x float>) nounwind readnone
-define void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind {
+define amdgpu_kernel void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind {
%src0 = load <4 x float>, <4 x float> addrspace(1)* %a, align 16
%src1 = load <4 x float>, <4 x float> addrspace(1)* %b, align 16
%dp4 = call float @llvm.r600.dot4(<4 x float> %src0, <4 x float> %src1) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll b/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll
index e4e6dd8e1069..93caafbb9524 100644
--- a/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll
+++ b/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll
@@ -2,7 +2,7 @@
; EG-LABEL: {{^}}test_group_barrier:
; EG: GROUP_BARRIER
-define void @test_group_barrier(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_group_barrier(i32 addrspace(1)* %out) #0 {
entry:
%tmp = call i32 @llvm.r600.read.tidig.x()
%tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp
diff --git a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
index a5b07e072fa5..82c42601ef1e 100644
--- a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
+++ b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
@@ -14,7 +14,7 @@
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
-define void @local_size_x(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_x(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.local.size.x() #0
store i32 %0, i32 addrspace(1)* %out
@@ -29,7 +29,7 @@ entry:
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
-define void @local_size_y(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_y(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.local.size.y() #0
store i32 %0, i32 addrspace(1)* %out
@@ -44,7 +44,7 @@ entry:
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
-define void @local_size_z(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_z(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.local.size.z() #0
store i32 %0, i32 addrspace(1)* %out
@@ -59,7 +59,7 @@ entry:
; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]]
; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VY]]
; GCN: buffer_store_dword [[VAL]]
-define void @local_size_xy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_xy(i32 addrspace(1)* %out) {
entry:
%x = call i32 @llvm.r600.read.local.size.x() #0
%y = call i32 @llvm.r600.read.local.size.y() #0
@@ -78,7 +78,7 @@ entry:
; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VZ]]
; GCN: buffer_store_dword [[VAL]]
-define void @local_size_xz(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_xz(i32 addrspace(1)* %out) {
entry:
%x = call i32 @llvm.r600.read.local.size.x() #0
%z = call i32 @llvm.r600.read.local.size.z() #0
@@ -98,7 +98,7 @@ entry:
; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[Y]], [[VZ]]
; GCN: buffer_store_dword [[VAL]]
-define void @local_size_yz(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_yz(i32 addrspace(1)* %out) {
entry:
%y = call i32 @llvm.r600.read.local.size.y() #0
%z = call i32 @llvm.r600.read.local.size.z() #0
@@ -121,7 +121,7 @@ entry:
; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
; GCN: v_mad_u32_u24 [[VAL:v[0-9]+]], [[X]], [[VY]], [[VZ]]
; GCN: buffer_store_dword [[VAL]]
-define void @local_size_xyz(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_xyz(i32 addrspace(1)* %out) {
entry:
%x = call i32 @llvm.r600.read.local.size.x() #0
%y = call i32 @llvm.r600.read.local.size.y() #0
@@ -138,7 +138,7 @@ entry:
; GCN-NOT: 0xffff
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NEXT: buffer_store_dword [[VVAL]]
-define void @local_size_x_known_bits(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_x_known_bits(i32 addrspace(1)* %out) {
entry:
%size = call i32 @llvm.r600.read.local.size.x() #0
%shl = shl i32 %size, 16
@@ -153,7 +153,7 @@ entry:
; GCN-NOT: 0xffff
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NEXT: buffer_store_dword [[VVAL]]
-define void @local_size_y_known_bits(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_y_known_bits(i32 addrspace(1)* %out) {
entry:
%size = call i32 @llvm.r600.read.local.size.y() #0
%shl = shl i32 %size, 16
@@ -168,7 +168,7 @@ entry:
; GCN-NOT: 0xffff
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NEXT: buffer_store_dword [[VVAL]]
-define void @local_size_z_known_bits(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_z_known_bits(i32 addrspace(1)* %out) {
entry:
%size = call i32 @llvm.r600.read.local.size.z() #0
%shl = shl i32 %size, 16
diff --git a/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll b/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll
index 1c6e7950e9b7..90d076d4fb4d 100644
--- a/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll
+++ b/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll
@@ -4,7 +4,7 @@ declare float @llvm.r600.recipsqrt.clamped.f32(float) nounwind readnone
; EG-LABEL: {{^}}rsq_clamped_f32:
; EG: RECIPSQRT_CLAMPED
-define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind {
+define amdgpu_kernel void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind {
%rsq_clamped = call float @llvm.r600.recipsqrt.clamped.f32(float %src)
store float %rsq_clamped, float addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll b/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll
index 1d6bff01e662..d9177b39b8ac 100644
--- a/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll
+++ b/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll
@@ -4,7 +4,7 @@ declare float @llvm.r600.recipsqrt.ieee.f32(float) nounwind readnone
; EG-LABEL: {{^}}recipsqrt.ieee_f32:
; EG: RECIPSQRT_IEEE
-define void @recipsqrt.ieee_f32(float addrspace(1)* %out, float %src) nounwind {
+define amdgpu_kernel void @recipsqrt.ieee_f32(float addrspace(1)* %out, float %src) nounwind {
%recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float %src) nounwind readnone
store float %recipsqrt.ieee, float addrspace(1)* %out, align 4
ret void
@@ -13,7 +13,7 @@ define void @recipsqrt.ieee_f32(float addrspace(1)* %out, float %src) nounwind {
; TODO: Really these should be constant folded
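; (Editorial note: "constant folded" here means a call such as
; @llvm.r600.recipsqrt.ieee.f32(float 4.0) could be evaluated at compile time
; to 0.5 and stored directly, instead of emitting RECIPSQRT_IEEE at runtime.)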
; EG-LABEL: {{^}}recipsqrt.ieee_f32_constant_4.0
; EG: RECIPSQRT_IEEE
-define void @recipsqrt.ieee_f32_constant_4.0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @recipsqrt.ieee_f32_constant_4.0(float addrspace(1)* %out) nounwind {
%recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float 4.0) nounwind readnone
store float %recipsqrt.ieee, float addrspace(1)* %out, align 4
ret void
@@ -21,7 +21,7 @@ define void @recipsqrt.ieee_f32_constant_4.0(float addrspace(1)* %out) nounwind
; EG-LABEL: {{^}}recipsqrt.ieee_f32_constant_100.0
; EG: RECIPSQRT_IEEE
-define void @recipsqrt.ieee_f32_constant_100.0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @recipsqrt.ieee_f32_constant_100.0(float addrspace(1)* %out) nounwind {
%recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float 100.0) nounwind readnone
store float %recipsqrt.ieee, float addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.r600.tex.ll b/test/CodeGen/AMDGPU/llvm.r600.tex.ll
index 409037f3e976..98044917e2b0 100644
--- a/test/CodeGen/AMDGPU/llvm.r600.tex.ll
+++ b/test/CodeGen/AMDGPU/llvm.r600.tex.ll
@@ -17,7 +17,7 @@
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
-define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
bb:
%addr = load <4 x float>, <4 x float> addrspace(1)* %in
%tmp = shufflevector <4 x float> %addr, <4 x float> %addr, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 3657940f36fd..59e81a7acc0b 100644
--- a/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
declare half @llvm.rint.f16(half %a)
declare <2 x half> @llvm.rint.v2f16(<2 x half> %a)
@@ -9,10 +10,10 @@ declare <2 x half> @llvm.rint.v2f16(<2 x half> %a)
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_rndne_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_rndne_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
+; GFX89: v_rndne_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @rint_f16(
+define amdgpu_kernel void @rint_f16(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -25,20 +26,30 @@ entry:
; GCN-LABEL: {{^}}rint_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_rndne_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_rndne_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; VI: v_rndne_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: v_and_b32
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NOT: v_and_b32
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
+; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
+; GFX9: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; GFX9: v_rndne_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
+; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
+; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]
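; (Editorial note: gfx900 has the v_lshl_or_b32 VOP3 instruction, so the v2f16
; result is repacked as (hi << 16) | (lo & 0xffff) in two instructions rather
; than the VI sdwa form or the SI and/shift/or sequence.)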
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @rint_v2f16(
+define amdgpu_kernel void @rint_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/llvm.rint.f64.ll b/test/CodeGen/AMDGPU/llvm.rint.f64.ll
index c63fb1727940..30ce8ed83ff1 100644
--- a/test/CodeGen/AMDGPU/llvm.rint.f64.ll
+++ b/test/CodeGen/AMDGPU/llvm.rint.f64.ll
@@ -11,7 +11,7 @@
; SI: v_cndmask_b32
; SI: v_cndmask_b32
; SI: s_endpgm
-define void @rint_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @rint_f64(double addrspace(1)* %out, double %in) {
entry:
%0 = call double @llvm.rint.f64(double %in)
store double %0, double addrspace(1)* %out
@@ -21,7 +21,7 @@ entry:
; FUNC-LABEL: {{^}}rint_v2f64:
; CI: v_rndne_f64_e32
; CI: v_rndne_f64_e32
-define void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
entry:
%0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %in)
store <2 x double> %0, <2 x double> addrspace(1)* %out
@@ -33,7 +33,7 @@ entry:
; CI: v_rndne_f64_e32
; CI: v_rndne_f64_e32
; CI: v_rndne_f64_e32
-define void @rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
entry:
%0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %in)
store <4 x double> %0, <4 x double> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/llvm.rint.ll b/test/CodeGen/AMDGPU/llvm.rint.ll
index cf7c0e4c6fb6..4056bc39448d 100644
--- a/test/CodeGen/AMDGPU/llvm.rint.ll
+++ b/test/CodeGen/AMDGPU/llvm.rint.ll
@@ -6,7 +6,7 @@
; R600: RNDNE
; SI: v_rndne_f32_e32
-define void @rint_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @rint_f32(float addrspace(1)* %out, float %in) {
entry:
%0 = call float @llvm.rint.f32(float %in) #0
store float %0, float addrspace(1)* %out
@@ -19,7 +19,7 @@ entry:
; SI: v_rndne_f32_e32
; SI: v_rndne_f32_e32
-define void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
entry:
%0 = call <2 x float> @llvm.rint.v2f32(<2 x float> %in) #0
store <2 x float> %0, <2 x float> addrspace(1)* %out
@@ -36,7 +36,7 @@ entry:
; SI: v_rndne_f32_e32
; SI: v_rndne_f32_e32
; SI: v_rndne_f32_e32
-define void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
entry:
%0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %in) #0
store <4 x float> %0, <4 x float> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/llvm.round.f64.ll b/test/CodeGen/AMDGPU/llvm.round.f64.ll
index 3ea4551f0ee7..c58b9b4d9e94 100644
--- a/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -2,7 +2,7 @@
; FUNC-LABEL: {{^}}round_f64:
; SI: s_endpgm
-define void @round_f64(double addrspace(1)* %out, double %x) #0 {
+define amdgpu_kernel void @round_f64(double addrspace(1)* %out, double %x) #0 {
%result = call double @llvm.round.f64(double %x) #1
store double %result, double addrspace(1)* %out
ret void
@@ -26,7 +26,7 @@ define void @round_f64(double addrspace(1)* %out, double %x) #0 {
; SI: buffer_store_dwordx2
; SI: s_endpgm
-define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
%out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
@@ -38,7 +38,7 @@ define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0
; FUNC-LABEL: {{^}}round_v2f64:
; SI: s_endpgm
-define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 {
+define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 {
%result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1
store <2 x double> %result, <2 x double> addrspace(1)* %out
ret void
@@ -46,7 +46,7 @@ define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 {
; FUNC-LABEL: {{^}}round_v4f64:
; SI: s_endpgm
-define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
+define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
%result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1
store <4 x double> %result, <4 x double> addrspace(1)* %out
ret void
@@ -54,7 +54,7 @@ define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
; FUNC-LABEL: {{^}}round_v8f64:
; SI: s_endpgm
-define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 {
+define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 {
%result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1
store <8 x double> %result, <8 x double> addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.round.ll b/test/CodeGen/AMDGPU/llvm.round.ll
index 7e8f8ff172e8..ffe87977870b 100644
--- a/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/test/CodeGen/AMDGPU/llvm.round.ll
@@ -1,18 +1,19 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}round_f32:
-; SI-DAG: s_load_dword [[SX:s[0-9]+]]
-; SI-DAG: s_brev_b32 [[K:s[0-9]+]], -2{{$}}
-; SI-DAG: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]]
-; SI-DAG: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
-; SI-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]]
-; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]]
-; SI: v_cmp_ge_f32_e64 vcc, |[[SUB]]|, 0.5
-; SI: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[VX]]
-; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
-; SI: buffer_store_dword [[RESULT]]
+; GCN-DAG: s_load_dword [[SX:s[0-9]+]]
+; GCN-DAG: s_brev_b32 [[K:s[0-9]+]], -2{{$}}
+; GCN-DAG: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]]
+; GCN-DAG: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
+; GCN-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]]
+; GCN: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]]
+; GCN: v_cmp_ge_f32_e64 vcc, |[[SUB]]|, 0.5
+; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[VX]]
+; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
+; GCN: buffer_store_dword [[RESULT]]
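; (Editorial note: s_brev_b32 of -2 produces 0x7fffffff, so the v_bfi_b32 above
; computes copysign(1.0, x); the whole sequence implements
; round(x) = trunc(x) + (|x - trunc(x)| >= 0.5 ? copysign(1.0, x) : 0.0).)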
; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]]
; R600-DAG: ADD {{.*}},
@@ -20,7 +21,7 @@
; R600-DAG: SETGE
; R600-DAG: CNDE
; R600-DAG: ADD
-define void @round_f32(float addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @round_f32(float addrspace(1)* %out, float %x) #0 {
%result = call float @llvm.round.f32(float %x) #1
store float %result, float addrspace(1)* %out
ret void
@@ -32,36 +33,78 @@ define void @round_f32(float addrspace(1)* %out, float %x) #0 {
; compiler doesn't crash.
; FUNC-LABEL: {{^}}round_v2f32:
-; SI: s_endpgm
+; GCN: s_endpgm
; R600: CF_END
-define void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 {
+define amdgpu_kernel void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 {
%result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1
store <2 x float> %result, <2 x float> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}round_v4f32:
-; SI: s_endpgm
+; GCN: s_endpgm
; R600: CF_END
-define void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 {
+define amdgpu_kernel void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 {
%result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1
store <4 x float> %result, <4 x float> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}round_v8f32:
-; SI: s_endpgm
+; GCN: s_endpgm
; R600: CF_END
-define void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 {
+define amdgpu_kernel void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 {
%result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1
store <8 x float> %result, <8 x float> addrspace(1)* %out
ret void
}
+; FUNC-LABEL: {{^}}round_f16:
+; GFX89-DAG: s_load_dword [[SX:s[0-9]+]]
+; GFX89-DAG: s_movk_i32 [[K:s[0-9]+]], 0x7fff{{$}}
+; GFX89-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]]
+; GFX89-DAG: v_mov_b32_e32 [[BFI_K:v[0-9]+]], 0x3c00
+; GFX89: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], [[BFI_K]], [[VX]]
+
+; GFX89: v_trunc_f16_e32 [[TRUNC:v[0-9]+]], [[SX]]
+; GFX89: v_sub_f16_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
+; GFX89: v_cmp_ge_f16_e64 vcc, |[[SUB]]|, 0.5
+; GFX89: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[COPYSIGN]]
+; GFX89: v_add_f16_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
+; GFX89: buffer_store_short [[RESULT]]
+define amdgpu_kernel void @round_f16(half addrspace(1)* %out, i32 %x.arg) #0 {
+ %x.arg.trunc = trunc i32 %x.arg to i16
+ %x = bitcast i16 %x.arg.trunc to half
+ %result = call half @llvm.round.f16(half %x) #1
+ store half %result, half addrspace(1)* %out
+ ret void
+}
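; (Editorial note, assumption: the half operand is passed as an i32 kernel
; argument and recovered with trunc plus bitcast so the test does not depend on
; how sub-dword kernel arguments are laid out in memory.)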
+
+; Should be scalarized
+; FUNC-LABEL: {{^}}round_v2f16:
+; GFX89-DAG: s_movk_i32 [[K:s[0-9]+]], 0x7fff{{$}}
+; GFX89-DAG: v_mov_b32_e32 [[BFI_K:v[0-9]+]], 0x3c00
+; GFX89: v_bfi_b32 [[COPYSIGN0:v[0-9]+]], [[K]], [[BFI_K]],
+; GFX89: v_bfi_b32 [[COPYSIGN1:v[0-9]+]], [[K]], [[BFI_K]],
+
+; GFX9: v_and_b32_e32
+; GFX9: v_lshl_or_b32
+define amdgpu_kernel void @round_v2f16(<2 x half> addrspace(1)* %out, i32 %in.arg) #0 {
+ %in = bitcast i32 %in.arg to <2 x half>
+ %result = call <2 x half> @llvm.round.v2f16(<2 x half> %in)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out
+ ret void
+}
+
declare float @llvm.round.f32(float) #1
declare <2 x float> @llvm.round.v2f32(<2 x float>) #1
declare <4 x float> @llvm.round.v4f32(<4 x float>) #1
declare <8 x float> @llvm.round.v8f32(<8 x float>) #1
+declare half @llvm.round.f16(half) #1
+declare <2 x half> @llvm.round.v2f16(<2 x half>) #1
+declare <4 x half> @llvm.round.v4f16(<4 x half>) #1
+declare <8 x half> @llvm.round.v8f16(<8 x half>) #1
+
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index b01932f69b06..eb1f32c981f8 100644
--- a/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -13,7 +13,7 @@ declare <2 x half> @llvm.sin.v2f16(<2 x half> %a)
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @sin_f16(
+define amdgpu_kernel void @sin_f16(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -26,25 +26,35 @@ entry:
; GCN-LABEL: {{^}}sin_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}}
-; GCN: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]]
-; VI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]]
-; GCN: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
-; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]]
-; VI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]]
-; GCN: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
-; GCN: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
-; GCN: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
-; GCN: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; GCN: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]]
+; SI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
+; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]]
+; SI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
+
+; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]]
+; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]]
+; VI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
+; VI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
+
+; GCN-DAG: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
+; GCN-DAG: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
+; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @sin_v2f16(
+define amdgpu_kernel void @sin_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
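; (Editorial note: 0x3e22f983 is 1/(2*pi) in f32, matching the inline constant
; 0.15915494 used on VI; the hardware v_sin_f32 expects its operand in units of
; full periods, hence the multiply by 1/(2*pi) and v_fract before v_sin.)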
diff --git a/test/CodeGen/AMDGPU/llvm.sin.ll b/test/CodeGen/AMDGPU/llvm.sin.ll
index 04754396a0f7..2a17303267ba 100644
--- a/test/CodeGen/AMDGPU/llvm.sin.ll
+++ b/test/CodeGen/AMDGPU/llvm.sin.ll
@@ -12,7 +12,7 @@
; SI: v_fract_f32
; SI: v_sin_f32
; SI-NOT: v_sin_f32
-define void @sin_f32(float addrspace(1)* %out, float %x) #1 {
+define amdgpu_kernel void @sin_f32(float addrspace(1)* %out, float %x) #1 {
%sin = call float @llvm.sin.f32(float %x)
store float %sin, float addrspace(1)* %out
ret void
@@ -24,7 +24,7 @@ define void @sin_f32(float addrspace(1)* %out, float %x) #1 {
; SI: v_fract_f32
; SI: v_sin_f32
; SI-NOT: v_sin_f32
-define void @safe_sin_3x_f32(float addrspace(1)* %out, float %x) #1 {
+define amdgpu_kernel void @safe_sin_3x_f32(float addrspace(1)* %out, float %x) #1 {
%y = fmul float 3.0, %x
%sin = call float @llvm.sin.f32(float %y)
store float %sin, float addrspace(1)* %out
@@ -38,7 +38,7 @@ define void @safe_sin_3x_f32(float addrspace(1)* %out, float %x) #1 {
; SI: v_fract_f32
; SI: v_sin_f32
; SI-NOT: v_sin_f32
-define void @unsafe_sin_3x_f32(float addrspace(1)* %out, float %x) #2 {
+define amdgpu_kernel void @unsafe_sin_3x_f32(float addrspace(1)* %out, float %x) #2 {
%y = fmul float 3.0, %x
%sin = call float @llvm.sin.f32(float %y)
store float %sin, float addrspace(1)* %out
@@ -51,7 +51,7 @@ define void @unsafe_sin_3x_f32(float addrspace(1)* %out, float %x) #2 {
; SI: v_fract_f32
; SI: v_sin_f32
; SI-NOT: v_sin_f32
-define void @safe_sin_2x_f32(float addrspace(1)* %out, float %x) #1 {
+define amdgpu_kernel void @safe_sin_2x_f32(float addrspace(1)* %out, float %x) #1 {
%y = fmul float 2.0, %x
%sin = call float @llvm.sin.f32(float %y)
store float %sin, float addrspace(1)* %out
@@ -65,7 +65,7 @@ define void @safe_sin_2x_f32(float addrspace(1)* %out, float %x) #1 {
; SI: v_fract_f32
; SI: v_sin_f32
; SI-NOT: v_sin_f32
-define void @unsafe_sin_2x_f32(float addrspace(1)* %out, float %x) #2 {
+define amdgpu_kernel void @unsafe_sin_2x_f32(float addrspace(1)* %out, float %x) #2 {
%y = fmul float 2.0, %x
%sin = call float @llvm.sin.f32(float %y)
store float %sin, float addrspace(1)* %out
@@ -78,7 +78,7 @@ define void @unsafe_sin_2x_f32(float addrspace(1)* %out, float %x) #2 {
; SI: v_fract_f32
; SI: v_sin_f32
; SI-NOT: v_sin_f32
-define void @test_safe_2sin_f32(float addrspace(1)* %out, float %x) #1 {
+define amdgpu_kernel void @test_safe_2sin_f32(float addrspace(1)* %out, float %x) #1 {
%y = fmul float 2.0, %x
%sin = call float @llvm.sin.f32(float %y)
store float %sin, float addrspace(1)* %out
@@ -91,7 +91,7 @@ define void @test_safe_2sin_f32(float addrspace(1)* %out, float %x) #1 {
; SI: v_fract_f32
; SI: v_sin_f32
; SI-NOT: v_sin_f32
-define void @test_unsafe_2sin_f32(float addrspace(1)* %out, float %x) #2 {
+define amdgpu_kernel void @test_unsafe_2sin_f32(float addrspace(1)* %out, float %x) #2 {
%y = fmul float 2.0, %x
%sin = call float @llvm.sin.f32(float %y)
store float %sin, float addrspace(1)* %out
@@ -110,7 +110,7 @@ define void @test_unsafe_2sin_f32(float addrspace(1)* %out, float %x) #2 {
; SI: v_sin_f32
; SI: v_sin_f32
; SI-NOT: v_sin_f32
-define void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 {
+define amdgpu_kernel void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 {
%sin = call <4 x float> @llvm.sin.v4f32( <4 x float> %vx)
store <4 x float> %sin, <4 x float> addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
index 69125b0bcfdc..46ee6526aca2 100644
--- a/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
@@ -12,7 +12,7 @@ declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
; VI: v_sqrt_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @sqrt_f16(
+define amdgpu_kernel void @sqrt_f16(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -25,20 +25,24 @@ entry:
; GCN-LABEL: {{^}}sqrt_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_sqrt_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_sqrt_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_sqrt_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; VI: v_sqrt_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: v_and_b32
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_sqrt_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_sqrt_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NOT: v_and_b32
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @sqrt_v2f16(
+define amdgpu_kernel void @sqrt_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index 9f84b432209d..dc7182aa0d89 100644
--- a/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -12,7 +12,7 @@ declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a)
; VI: v_trunc_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @trunc_f16(
+define amdgpu_kernel void @trunc_f16(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@@ -25,20 +25,24 @@ entry:
; GCN-LABEL: {{^}}trunc_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_trunc_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_trunc_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_trunc_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; VI: v_trunc_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: v_and_b32
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_trunc_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_trunc_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NOT: v_and_b32
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @trunc_v2f16(
+define amdgpu_kernel void @trunc_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/load-constant-f64.ll b/test/CodeGen/AMDGPU/load-constant-f64.ll
index 1b42a9e96e01..0050d1a4f874 100644
--- a/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -6,7 +6,7 @@
; GCN: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}]
; GCN-NOHSA: buffer_store_dwordx2
; GCN-HSA: flat_store_dwordx2
-define void @constant_load_f64(double addrspace(1)* %out, double addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_f64(double addrspace(1)* %out, double addrspace(2)* %in) #0 {
%ld = load double, double addrspace(2)* %in
store double %ld, double addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/load-constant-i1.ll b/test/CodeGen/AMDGPU/load-constant-i1.ll
index 104af10036c1..c8abe5c77ee5 100644
--- a/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -9,56 +9,56 @@
; EG: VTX_READ_8
; EG: AND_INT
-define void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
%load = load i1, i1 addrspace(2)* %in
store i1 %load, i1 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}constant_load_v2i1:
-define void @constant_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <2 x i1>, <2 x i1> addrspace(2)* %in
store <2 x i1> %load, <2 x i1> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}constant_load_v3i1:
-define void @constant_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <3 x i1>, <3 x i1> addrspace(2)* %in
store <3 x i1> %load, <3 x i1> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}constant_load_v4i1:
-define void @constant_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <4 x i1>, <4 x i1> addrspace(2)* %in
store <4 x i1> %load, <4 x i1> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}constant_load_v8i1:
-define void @constant_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <8 x i1>, <8 x i1> addrspace(2)* %in
store <8 x i1> %load, <8 x i1> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}constant_load_v16i1:
-define void @constant_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <16 x i1>, <16 x i1> addrspace(2)* %in
store <16 x i1> %load, <16 x i1> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}constant_load_v32i1:
-define void @constant_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <32 x i1>, <32 x i1> addrspace(2)* %in
store <32 x i1> %load, <32 x i1> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}constant_load_v64i1:
-define void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <64 x i1>, <64 x i1> addrspace(2)* %in
store <64 x i1> %load, <64 x i1> addrspace(1)* %out
ret void
@@ -67,7 +67,7 @@ define void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspa
; FUNC-LABEL: {{^}}constant_zextload_i1_to_i32:
; GCN: buffer_load_ubyte
; GCN: buffer_store_dword
-define void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
%a = load i1, i1 addrspace(2)* %in
%ext = zext i1 %a to i32
store i32 %ext, i32 addrspace(1)* %out
@@ -81,7 +81,7 @@ define void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)
; EG: VTX_READ_8
; EG: BFE_INT
-define void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
%a = load i1, i1 addrspace(2)* %in
%ext = sext i1 %a to i32
store i32 %ext, i32 addrspace(1)* %out
@@ -89,7 +89,7 @@ define void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)
}
; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i32:
-define void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <1 x i1>, <1 x i1> addrspace(2)* %in
%ext = zext <1 x i1> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -97,7 +97,7 @@ define void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x
}
; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i32:
-define void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <1 x i1>, <1 x i1> addrspace(2)* %in
%ext = sext <1 x i1> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -105,7 +105,7 @@ define void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x
}
; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i32:
-define void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <2 x i1>, <2 x i1> addrspace(2)* %in
%ext = zext <2 x i1> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -113,7 +113,7 @@ define void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x
}
; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i32:
-define void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <2 x i1>, <2 x i1> addrspace(2)* %in
%ext = sext <2 x i1> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -121,7 +121,7 @@ define void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x
}
; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i32:
-define void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <3 x i1>, <3 x i1> addrspace(2)* %in
%ext = zext <3 x i1> %load to <3 x i32>
store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
@@ -129,7 +129,7 @@ define void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x
}
; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i32:
-define void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <3 x i1>, <3 x i1> addrspace(2)* %in
%ext = sext <3 x i1> %load to <3 x i32>
store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
@@ -137,7 +137,7 @@ define void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x
}
; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i32:
-define void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <4 x i1>, <4 x i1> addrspace(2)* %in
%ext = zext <4 x i1> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -145,7 +145,7 @@ define void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x
}
; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i32:
-define void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <4 x i1>, <4 x i1> addrspace(2)* %in
%ext = sext <4 x i1> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -153,7 +153,7 @@ define void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x
}
; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i32:
-define void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <8 x i1>, <8 x i1> addrspace(2)* %in
%ext = zext <8 x i1> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -161,7 +161,7 @@ define void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x
}
; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i32:
-define void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <8 x i1>, <8 x i1> addrspace(2)* %in
%ext = sext <8 x i1> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -169,7 +169,7 @@ define void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x
}
; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i32:
-define void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <16 x i1>, <16 x i1> addrspace(2)* %in
%ext = zext <16 x i1> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -177,7 +177,7 @@ define void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <1
}
; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i32:
-define void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <16 x i1>, <16 x i1> addrspace(2)* %in
%ext = sext <16 x i1> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -185,7 +185,7 @@ define void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <1
}
; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i32:
-define void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <32 x i1>, <32 x i1> addrspace(2)* %in
%ext = zext <32 x i1> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -193,7 +193,7 @@ define void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <3
}
; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i32:
-define void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <32 x i1>, <32 x i1> addrspace(2)* %in
%ext = sext <32 x i1> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -201,7 +201,7 @@ define void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <3
}
; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i32:
-define void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <64 x i1>, <64 x i1> addrspace(2)* %in
%ext = zext <64 x i1> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -209,7 +209,7 @@ define void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <6
}
; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i32:
-define void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <64 x i1>, <64 x i1> addrspace(2)* %in
%ext = sext <64 x i1> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -221,7 +221,7 @@ define void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <6
; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]]
; GCN: buffer_store_dwordx2
-define void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
%a = load i1, i1 addrspace(2)* %in
%ext = zext i1 %a to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -233,7 +233,7 @@ define void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)
; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
; GCN: buffer_store_dwordx2
-define void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
%a = load i1, i1 addrspace(2)* %in
%ext = sext i1 %a to i64
store i64 %ext, i64 addrspace(1)* %out
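; Sketch of the lowering contrast asserted above (illustration, register names
; assumed): zext i1 masks the loaded byte and pairs it with a zero high word,
; while sext i1 extracts a signed 1-bit field and replicates the sign bit:
;   zext: v_and_b32 lo, 1, [[LOAD]]   ; hi = 0
;   sext: v_bfe_i32 lo, v, 0, 1       ; hi = v_ashrrev_i32 31, lo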
@@ -241,7 +241,7 @@ define void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)
}
; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i64:
-define void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <1 x i1>, <1 x i1> addrspace(2)* %in
%ext = zext <1 x i1> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -249,7 +249,7 @@ define void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x
}
; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i64:
-define void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <1 x i1>, <1 x i1> addrspace(2)* %in
%ext = sext <1 x i1> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -257,7 +257,7 @@ define void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x
}
; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i64:
-define void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <2 x i1>, <2 x i1> addrspace(2)* %in
%ext = zext <2 x i1> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -265,7 +265,7 @@ define void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x
}
; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i64:
-define void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <2 x i1>, <2 x i1> addrspace(2)* %in
%ext = sext <2 x i1> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -273,7 +273,7 @@ define void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x
}
; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i64:
-define void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <3 x i1>, <3 x i1> addrspace(2)* %in
%ext = zext <3 x i1> %load to <3 x i64>
store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
@@ -281,7 +281,7 @@ define void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x
}
; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i64:
-define void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <3 x i1>, <3 x i1> addrspace(2)* %in
%ext = sext <3 x i1> %load to <3 x i64>
store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
@@ -289,7 +289,7 @@ define void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x
}
; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i64:
-define void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <4 x i1>, <4 x i1> addrspace(2)* %in
%ext = zext <4 x i1> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -297,7 +297,7 @@ define void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x
}
; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i64:
-define void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <4 x i1>, <4 x i1> addrspace(2)* %in
%ext = sext <4 x i1> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -305,7 +305,7 @@ define void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x
}
; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i64:
-define void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <8 x i1>, <8 x i1> addrspace(2)* %in
%ext = zext <8 x i1> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -313,7 +313,7 @@ define void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x
}
; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i64:
-define void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <8 x i1>, <8 x i1> addrspace(2)* %in
%ext = sext <8 x i1> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -321,7 +321,7 @@ define void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x
}
; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i64:
-define void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <16 x i1>, <16 x i1> addrspace(2)* %in
%ext = zext <16 x i1> %load to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -329,7 +329,7 @@ define void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <1
}
; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i64:
-define void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <16 x i1>, <16 x i1> addrspace(2)* %in
%ext = sext <16 x i1> %load to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -337,7 +337,7 @@ define void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <1
}
; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i64:
-define void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <32 x i1>, <32 x i1> addrspace(2)* %in
%ext = zext <32 x i1> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -345,7 +345,7 @@ define void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <3
}
; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i64:
-define void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <32 x i1>, <32 x i1> addrspace(2)* %in
%ext = sext <32 x i1> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -353,7 +353,7 @@ define void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <3
}
; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i64:
-define void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <64 x i1>, <64 x i1> addrspace(2)* %in
%ext = zext <64 x i1> %load to <64 x i64>
store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -361,7 +361,7 @@ define void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <6
}
; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i64:
-define void @constant_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
%load = load <64 x i1>, <64 x i1> addrspace(2)* %in
%ext = sext <64 x i1> %load to <64 x i64>
store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-constant-i16.ll b/test/CodeGen/AMDGPU/load-constant-i16.ll
index f7be1291040f..5dd2efdf6382 100644
--- a/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -8,7 +8,7 @@
; GCN-HSA: flat_load_ushort
; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(2)* %in) {
entry:
%ld = load i16, i16 addrspace(2)* %in
store i16 %ld, i16 addrspace(1)* %out
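; Sketch (assumption inferred from the checks in this file): scalar s_load
; reads are dword-granular, so a lone i16 from the constant address space goes
; through buffer/flat loads, while the dword-sized <2 x i16> below folds into
; a single s_load_dword:
;   %w = load i16, i16 addrspace(2)* %p              ; buffer/flat_load_ushort
;   %d = load <2 x i16>, <2 x i16> addrspace(2)* %q  ; s_load_dword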
@@ -19,7 +19,7 @@ entry:
; GCN: s_load_dword s
; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) {
entry:
%ld = load <2 x i16>, <2 x i16> addrspace(2)* %in
store <2 x i16> %ld, <2 x i16> addrspace(1)* %out
@@ -31,7 +31,7 @@ entry:
; EG-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1
-define void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
entry:
%ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
store <3 x i16> %ld, <3 x i16> addrspace(1)* %out
@@ -42,7 +42,7 @@ entry:
; GCN: s_load_dwordx2
; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) {
entry:
%ld = load <4 x i16>, <4 x i16> addrspace(2)* %in
store <4 x i16> %ld, <4 x i16> addrspace(1)* %out
@@ -53,7 +53,7 @@ entry:
; GCN: s_load_dwordx4
; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) {
entry:
%ld = load <8 x i16>, <8 x i16> addrspace(2)* %in
store <8 x i16> %ld, <8 x i16> addrspace(1)* %out
@@ -65,7 +65,7 @@ entry:
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) {
entry:
%ld = load <16 x i16>, <16 x i16> addrspace(2)* %in
store <16 x i16> %ld, <16 x i16> addrspace(1)* %out
@@ -80,7 +80,7 @@ entry:
; GCN-HSA: flat_store_dword
; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1
-define void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
%a = load i16, i16 addrspace(2)* %in
%ext = zext i16 %a to i32
store i32 %ext, i32 addrspace(1)* %out
@@ -97,7 +97,7 @@ define void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(
; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; EG: 16
-define void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
%a = load i16, i16 addrspace(2)* %in
%ext = sext i16 %a to i32
store i32 %ext, i32 addrspace(1)* %out
@@ -109,7 +109,7 @@ define void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(
; GCN-HSA: flat_load_ushort
; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1
-define void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
%load = load <1 x i16>, <1 x i16> addrspace(2)* %in
%ext = zext <1 x i16> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -123,7 +123,7 @@ define void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x
; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; EG: 16
-define void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
%load = load <1 x i16>, <1 x i16> addrspace(2)* %in
%ext = sext <1 x i16> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -140,7 +140,7 @@ define void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x
; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal
; EG: 16
; EG: 16
-define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
%load = load <2 x i16>, <2 x i16> addrspace(2)* %in
%ext = zext <2 x i16> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -160,7 +160,7 @@ define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x
; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV\.[XYZW]}}, 0.0, literal
; EG-DAG: 16
; EG-DAG: 16
-define void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
%load = load <2 x i16>, <2 x i16> addrspace(2)* %in
%ext = sext <2 x i16> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -183,7 +183,7 @@ define void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x
; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, literal
; EG-DAG: 65535
; EG-DAG: 65535
-define void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
entry:
%ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
%ext = zext <3 x i16> %ld to <3 x i32>
@@ -204,7 +204,7 @@ entry:
; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal
; EG-DAG: 16
; EG-DAG: 16
-define void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
entry:
%ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
%ext = sext <3 x i16> %ld to <3 x i32>
@@ -229,7 +229,7 @@ entry:
; EG-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{T[0-9]\.[XYZW]}}, literal
; EG-DAG: 65535
; EG-DAG: 65535
-define void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
%load = load <4 x i16>, <4 x i16> addrspace(2)* %in
%ext = zext <4 x i16> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -254,7 +254,7 @@ define void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
-define void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
%load = load <4 x i16>, <4 x i16> addrspace(2)* %in
%ext = sext <4 x i16> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -288,7 +288,7 @@ define void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x
; EG-DAG: 65535
; EG-DAG: 65535
; EG-DAG: 65535
-define void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
%load = load <8 x i16>, <8 x i16> addrspace(2)* %in
%ext = zext <8 x i16> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -322,7 +322,7 @@ define void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
-define void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
%load = load <8 x i16>, <8 x i16> addrspace(2)* %in
%ext = sext <8 x i16> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -337,7 +337,7 @@ define void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x
; v16i16 is naturally 32-byte aligned
; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 16, #1
-define void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
%load = load <16 x i16>, <16 x i16> addrspace(2)* %in
%ext = zext <16 x i16> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
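; Sketch (illustration): with no explicit alignment the load above gets the
; natural 32-byte alignment of <16 x i16>, which is why EG reads it as two
; 16-byte VTX_READ_128s at offsets 0 and 16:
;   %v = load <16 x i16>, <16 x i16> addrspace(2)* %p   ; implies align 32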
@@ -352,7 +352,7 @@ define void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <
; v16i16 is naturally 32-byte aligned
; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 16, #1
-define void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
%load = load <16 x i16>, <16 x i16> addrspace(2)* %in
%ext = sext <16 x i16> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -369,7 +369,7 @@ define void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
-define void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
%load = load <32 x i16>, <32 x i16> addrspace(2)* %in
%ext = zext <32 x i16> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -385,7 +385,7 @@ define void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
-define void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
%load = load <32 x i16>, <32 x i16> addrspace(2)* %in
%ext = sext <32 x i16> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -404,7 +404,7 @@ define void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1
-define void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
%load = load <64 x i16>, <64 x i16> addrspace(2)* %in
%ext = zext <64 x i16> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -421,7 +421,7 @@ define void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1
-define void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
%load = load <64 x i16>, <64 x i16> addrspace(2)* %in
%ext = sext <64 x i16> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -438,7 +438,7 @@ define void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <
; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: MOV {{.*}}, 0.0
-define void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
%a = load i16, i16 addrspace(2)* %in
%ext = zext i16 %a to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -464,7 +464,7 @@ define void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(
; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
; TODO: These could be expanded earlier using ASHR 15
; EG: 31
-define void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
%a = load i16, i16 addrspace(2)* %in
%ext = sext i16 %a to i64
store i64 %ext, i64 addrspace(1)* %out
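; Sketch of the lowering the checks above assert (illustration): the 64-bit
; sign extension is a 32-bit sign extension whose high word is the sign mask:
;   %lo = sext i16 %a to i32   ; EG: BFE_INT ..., 0.0, 16
;   %hi = ashr i32 %lo, 31     ; EG: ASHR ..., 31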
@@ -475,7 +475,7 @@ define void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(
; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: MOV {{.*}}, 0.0
-define void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
%load = load <1 x i16>, <1 x i16> addrspace(2)* %in
%ext = zext <1 x i16> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -488,7 +488,7 @@ define void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x
; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
; TODO: These could be expanded earlier using ASHR 15
; EG: 31
-define void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
%load = load <1 x i16>, <1 x i16> addrspace(2)* %in
%ext = sext <1 x i16> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -498,7 +498,7 @@ define void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x
; FUNC-LABEL: {{^}}constant_zextload_v2i16_to_v2i64:
; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
%load = load <2 x i16>, <2 x i16> addrspace(2)* %in
%ext = zext <2 x i16> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -508,7 +508,7 @@ define void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x
; FUNC-LABEL: {{^}}constant_sextload_v2i16_to_v2i64:
; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
%load = load <2 x i16>, <2 x i16> addrspace(2)* %in
%ext = sext <2 x i16> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -518,7 +518,7 @@ define void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x
; FUNC-LABEL: {{^}}constant_zextload_v4i16_to_v4i64:
; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
%load = load <4 x i16>, <4 x i16> addrspace(2)* %in
%ext = zext <4 x i16> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -528,7 +528,7 @@ define void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x
; FUNC-LABEL: {{^}}constant_sextload_v4i16_to_v4i64:
; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
%load = load <4 x i16>, <4 x i16> addrspace(2)* %in
%ext = sext <4 x i16> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -538,7 +538,7 @@ define void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x
; FUNC-LABEL: {{^}}constant_zextload_v8i16_to_v8i64:
; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
%load = load <8 x i16>, <8 x i16> addrspace(2)* %in
%ext = zext <8 x i16> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -548,7 +548,7 @@ define void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x
; FUNC-LABEL: {{^}}constant_sextload_v8i16_to_v8i64:
; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
%load = load <8 x i16>, <8 x i16> addrspace(2)* %in
%ext = sext <8 x i16> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -559,7 +559,7 @@ define void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
%load = load <16 x i16>, <16 x i16> addrspace(2)* %in
%ext = zext <16 x i16> %load to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -570,7 +570,7 @@ define void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
%load = load <16 x i16>, <16 x i16> addrspace(2)* %in
%ext = sext <16 x i16> %load to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -583,7 +583,7 @@ define void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
-define void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
%load = load <32 x i16>, <32 x i16> addrspace(2)* %in
%ext = zext <32 x i16> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -596,7 +596,7 @@ define void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
-define void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
%load = load <32 x i16>, <32 x i16> addrspace(2)* %in
%ext = sext <32 x i16> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -606,7 +606,7 @@ define void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <
; These trigger machine verifier errors about undefined registers
; ; XFUNC-LABEL: {{^}}constant_zextload_v64i16_to_v64i64:
-; define void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
+; define amdgpu_kernel void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
; %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
; %ext = zext <64 x i16> %load to <64 x i64>
; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -614,7 +614,7 @@ define void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <
; }
; ; XFUNC-LABEL: {{^}}constant_sextload_v64i16_to_v64i64:
-; define void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
+; define amdgpu_kernel void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
; %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
; %ext = sext <64 x i16> %load to <64 x i64>
; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-constant-i32.ll b/test/CodeGen/AMDGPU/load-constant-i32.ll
index d1ff1c706c40..7370d45ca6b9 100644
--- a/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -7,7 +7,7 @@
; GCN: s_load_dword s{{[0-9]+}}
; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-define void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
entry:
%ld = load i32, i32 addrspace(2)* %in
store i32 %ld, i32 addrspace(1)* %out
@@ -18,7 +18,7 @@ entry:
; GCN: s_load_dwordx2
; EG: VTX_READ_64
-define void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
entry:
%ld = load <2 x i32>, <2 x i32> addrspace(2)* %in
store <2 x i32> %ld, <2 x i32> addrspace(1)* %out
@@ -29,7 +29,7 @@ entry:
; GCN: s_load_dwordx4
; EG: VTX_READ_128
-define void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(2)* %in) #0 {
entry:
%ld = load <3 x i32>, <3 x i32> addrspace(2)* %in
store <3 x i32> %ld, <3 x i32> addrspace(1)* %out
@@ -40,7 +40,7 @@ entry:
; GCN: s_load_dwordx4
; EG: VTX_READ_128
-define void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
entry:
%ld = load <4 x i32>, <4 x i32> addrspace(2)* %in
store <4 x i32> %ld, <4 x i32> addrspace(1)* %out
@@ -52,7 +52,7 @@ entry:
; EG: VTX_READ_128
; EG: VTX_READ_128
-define void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
entry:
%ld = load <8 x i32>, <8 x i32> addrspace(2)* %in
store <8 x i32> %ld, <8 x i32> addrspace(1)* %out
@@ -66,7 +66,7 @@ entry:
; EG: VTX_READ_128
; EG: VTX_READ_128
; EG: VTX_READ_128
-define void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
entry:
%ld = load <16 x i32>, <16 x i32> addrspace(2)* %in
store <16 x i32> %ld, <16 x i32> addrspace(1)* %out
@@ -81,7 +81,7 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
; EG: CF_END
; EG: VTX_READ_32
-define void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
%ld = load i32, i32 addrspace(2)* %in
%ext = zext i32 %ld to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -98,7 +98,7 @@ define void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(
; EG: VTX_READ_32
; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.
; EG: 31
-define void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
%ld = load i32, i32 addrspace(2)* %in
%ext = sext i32 %ld to i64
store i64 %ext, i64 addrspace(1)* %out
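; Sketch contrasting the two i64 extensions above (illustration): zext pairs
; the loaded dword with a constant-zero high word, while sext pairs it with an
; arithmetic shift of the low word:
;   zext: {lo, 0}
;   sext: {lo, ashr i32 lo, 31}   ; s_ashr_i32 / ASHR 31 in the checks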
@@ -108,7 +108,7 @@ define void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(
; FUNC-LABEL: {{^}}constant_zextload_v1i32_to_v1i64:
; GCN: s_load_dword
; GCN: store_dwordx2
-define void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 {
%ld = load <1 x i32>, <1 x i32> addrspace(2)* %in
%ext = zext <1 x i32> %ld to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -119,7 +119,7 @@ define void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x
; GCN: s_load_dword s[[LO:[0-9]+]]
; GCN: s_ashr_i32 s[[HI:[0-9]+]], s[[LO]], 31
; GCN: store_dwordx2
-define void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 {
%ld = load <1 x i32>, <1 x i32> addrspace(2)* %in
%ext = sext <1 x i32> %ld to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -129,7 +129,7 @@ define void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x
; FUNC-LABEL: {{^}}constant_zextload_v2i32_to_v2i64:
; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN: store_dwordx4
-define void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
%ld = load <2 x i32>, <2 x i32> addrspace(2)* %in
%ext = zext <2 x i32> %ld to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -143,7 +143,7 @@ define void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x
; GCN-DAG: s_ashr_i32
; GCN: store_dwordx4
-define void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
%ld = load <2 x i32>, <2 x i32> addrspace(2)* %in
%ext = sext <2 x i32> %ld to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -155,7 +155,7 @@ define void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x
; GCN: store_dwordx4
; GCN: store_dwordx4
-define void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
%ld = load <4 x i32>, <4 x i32> addrspace(2)* %in
%ext = zext <4 x i32> %ld to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -172,7 +172,7 @@ define void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x
; GCN: store_dwordx4
; GCN: store_dwordx4
-define void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
%ld = load <4 x i32>, <4 x i32> addrspace(2)* %in
%ext = sext <4 x i32> %ld to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -191,7 +191,7 @@ define void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
-define void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
%ld = load <8 x i32>, <8 x i32> addrspace(2)* %in
%ext = zext <8 x i32> %ld to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -219,7 +219,7 @@ define void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
-define void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
%ld = load <8 x i32>, <8 x i32> addrspace(2)* %in
%ext = sext <8 x i32> %ld to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -240,7 +240,7 @@ define void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x
; GCN: store_dwordx4
; GCN: store_dwordx4
; GCN: store_dwordx4
-define void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
%ld = load <16 x i32>, <16 x i32> addrspace(2)* %in
%ext = sext <16 x i32> %ld to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -267,7 +267,7 @@ define void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <
; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
-define void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
%ld = load <16 x i32>, <16 x i32> addrspace(2)* %in
%ext = zext <16 x i32> %ld to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -319,7 +319,7 @@ define void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
-define void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 {
%ld = load <32 x i32>, <32 x i32> addrspace(2)* %in
%ext = sext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -370,7 +370,7 @@ define void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
-define void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 {
%ld = load <32 x i32>, <32 x i32> addrspace(2)* %in
%ext = zext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-constant-i64.ll b/test/CodeGen/AMDGPU/load-constant-i64.ll
index 0d071a10b49a..14e50ea4c3ca 100644
--- a/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -7,7 +7,7 @@
; FUNC-LABEL: {{^}}constant_load_i64:
; GCN: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; EG: VTX_READ_64
-define void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(2)* %in) #0 {
%ld = load i64, i64 addrspace(2)* %in
store i64 %ld, i64 addrspace(1)* %out
ret void
@@ -17,7 +17,7 @@ define void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(2)* %in) #0
; GCN: s_load_dwordx4
; EG: VTX_READ_128
-define void @constant_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(2)* %in) #0 {
entry:
%ld = load <2 x i64>, <2 x i64> addrspace(2)* %in
store <2 x i64> %ld, <2 x i64> addrspace(1)* %out
@@ -29,7 +29,7 @@ entry:
; EG-DAG: VTX_READ_128
; EG-DAG: VTX_READ_128
-define void @constant_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(2)* %in) #0 {
entry:
%ld = load <3 x i64>, <3 x i64> addrspace(2)* %in
store <3 x i64> %ld, <3 x i64> addrspace(1)* %out
@@ -41,7 +41,7 @@ entry:
; EG: VTX_READ_128
; EG: VTX_READ_128
-define void @constant_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(2)* %in) #0 {
entry:
%ld = load <4 x i64>, <4 x i64> addrspace(2)* %in
store <4 x i64> %ld, <4 x i64> addrspace(1)* %out
@@ -55,7 +55,7 @@ entry:
; EG: VTX_READ_128
; EG: VTX_READ_128
; EG: VTX_READ_128
-define void @constant_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(2)* %in) #0 {
entry:
%ld = load <8 x i64>, <8 x i64> addrspace(2)* %in
store <8 x i64> %ld, <8 x i64> addrspace(1)* %out
@@ -74,7 +74,7 @@ entry:
; EG: VTX_READ_128
; EG: VTX_READ_128
; EG: VTX_READ_128
-define void @constant_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(2)* %in) #0 {
entry:
%ld = load <16 x i64>, <16 x i64> addrspace(2)* %in
store <16 x i64> %ld, <16 x i64> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-constant-i8.ll b/test/CodeGen/AMDGPU/load-constant-i8.ll
index 9fdc4ebfd854..6e56b9f9b6d6 100644
--- a/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -10,7 +10,7 @@
; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; TODO: this should not need an AND
-define void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
entry:
%ld = load i8, i8 addrspace(2)* %in
store i8 %ld, i8 addrspace(1)* %out
@@ -22,7 +22,7 @@ entry:
; GCN-HSA: flat_load_ushort v
; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
entry:
%ld = load <2 x i8>, <2 x i8> addrspace(2)* %in
store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
@@ -33,7 +33,7 @@ entry:
; GCN: s_load_dword s
; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
entry:
%ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
store <3 x i8> %ld, <3 x i8> addrspace(1)* %out
@@ -44,7 +44,7 @@ entry:
; GCN: s_load_dword s
; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
entry:
%ld = load <4 x i8>, <4 x i8> addrspace(2)* %in
store <4 x i8> %ld, <4 x i8> addrspace(1)* %out
@@ -55,7 +55,7 @@ entry:
; GCN: s_load_dwordx2
; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
entry:
%ld = load <8 x i8>, <8 x i8> addrspace(2)* %in
store <8 x i8> %ld, <8 x i8> addrspace(1)* %out
@@ -66,7 +66,7 @@ entry:
; GCN: s_load_dwordx4
; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
entry:
%ld = load <16 x i8>, <16 x i8> addrspace(2)* %in
store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
@@ -78,7 +78,7 @@ entry:
; GCN-HSA: flat_load_ubyte
; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
%a = load i8, i8 addrspace(2)* %in
%ext = zext i8 %a to i32
store i32 %ext, i32 addrspace(1)* %out
@@ -92,7 +92,7 @@ define void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)
; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1
; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; EG: 8
-define void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
%ld = load i8, i8 addrspace(2)* %in
%ext = sext i8 %ld to i32
store i32 %ext, i32 addrspace(1)* %out
@@ -102,7 +102,7 @@ define void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)
; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i32:
; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
%load = load <1 x i8>, <1 x i8> addrspace(2)* %in
%ext = zext <1 x i8> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -114,7 +114,7 @@ define void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x
; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1
; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; EG: 8
-define void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
%load = load <1 x i8>, <1 x i8> addrspace(2)* %in
%ext = sext <1 x i8> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -129,7 +129,7 @@ define void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x
; TODO: This should use DST, but for some reason there are redundant MOVs
; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal
; EG: 8
-define void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(2)* %in
%ext = zext <2 x i8> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -150,7 +150,7 @@ define void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
; EG-DAG: 8
; EG-DAG: 8
-define void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(2)* %in
%ext = sext <2 x i8> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -170,7 +170,7 @@ define void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x
; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal
; EG-DAG: 8
; EG-DAG: 8
-define void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
entry:
%ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
%ext = zext <3 x i8> %ld to <3 x i32>
@@ -193,7 +193,7 @@ entry:
; EG-DAG: 8
; EG-DAG: 8
; EG-DAG: 8
-define void @constant_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
entry:
%ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
%ext = sext <3 x i8> %ld to <3 x i32>
@@ -214,7 +214,7 @@ entry:
; EG-DAG: 8
; EG-DAG: 8
; EG-DAG: 8
-define void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
%load = load <4 x i8>, <4 x i8> addrspace(2)* %in
%ext = zext <4 x i8> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -236,7 +236,7 @@ define void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x
; EG-DAG: 8
; EG-DAG: 8
; EG-DAG: 8
-define void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
%load = load <4 x i8>, <4 x i8> addrspace(2)* %in
%ext = sext <4 x i8> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -264,7 +264,7 @@ define void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x
; EG-DAG: 8
; EG-DAG: 8
; EG-DAG: 8
-define void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
%load = load <8 x i8>, <8 x i8> addrspace(2)* %in
%ext = zext <8 x i8> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -294,7 +294,7 @@ define void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x
; EG-DAG: 8
; EG-DAG: 8
; EG-DAG: 8
-define void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
%load = load <8 x i8>, <8 x i8> addrspace(2)* %in
%ext = sext <8 x i8> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -335,7 +335,7 @@ define void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x
; EG-DAG: 8
; EG-DAG: 8
; EG-DAG: 8
-define void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
%load = load <16 x i8>, <16 x i8> addrspace(2)* %in
%ext = zext <16 x i8> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -378,7 +378,7 @@ define void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <1
; EG-DAG: 8
; EG-DAG: 8
; EG-DAG: 8
-define void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
%load = load <16 x i8>, <16 x i8> addrspace(2)* %in
%ext = sext <16 x i8> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -450,7 +450,7 @@ define void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <1
; EG-DAG: 8
; EG-DAG: 8
; EG-DAG: 8
-define void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
%load = load <32 x i8>, <32 x i8> addrspace(2)* %in
%ext = zext <32 x i8> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -526,7 +526,7 @@ define void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <3
; EG-DAG: 8
; EG-DAG: 8
; EG-DAG: 8
-define void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
%load = load <32 x i8>, <32 x i8> addrspace(2)* %in
%ext = sext <32 x i8> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -539,7 +539,7 @@ define void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <3
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1
-define void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
%load = load <64 x i8>, <64 x i8> addrspace(2)* %in
%ext = zext <64 x i8> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -552,7 +552,7 @@ define void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <6
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1
-define void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
%load = load <64 x i8>, <64 x i8> addrspace(2)* %in
%ext = sext <64 x i8> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -570,7 +570,7 @@ define void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <6
; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: MOV {{.*}}, 0.0
-define void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
%a = load i8, i8 addrspace(2)* %in
%ext = zext i8 %a to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -589,7 +589,7 @@ define void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)
; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
; TODO: Why not 7?
; EG: 31
-define void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
%a = load i8, i8 addrspace(2)* %in
%ext = sext i8 %a to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -600,7 +600,7 @@ define void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)
; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: MOV {{.*}}, 0.0
-define void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
%load = load <1 x i8>, <1 x i8> addrspace(2)* %in
%ext = zext <1 x i8> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -613,7 +613,7 @@ define void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x
; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
; TODO: Why not 7?
; EG: 31
-define void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
%load = load <1 x i8>, <1 x i8> addrspace(2)* %in
%ext = sext <1 x i8> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -623,7 +623,7 @@ define void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x
; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i64:
; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(2)* %in
%ext = zext <2 x i8> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -633,7 +633,7 @@ define void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x
; FUNC-LABEL: {{^}}constant_sextload_v2i8_to_v2i64:
; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(2)* %in
%ext = sext <2 x i8> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -643,7 +643,7 @@ define void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x
; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i64:
; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
%load = load <4 x i8>, <4 x i8> addrspace(2)* %in
%ext = zext <4 x i8> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -653,7 +653,7 @@ define void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x
; FUNC-LABEL: {{^}}constant_sextload_v4i8_to_v4i64:
; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
%load = load <4 x i8>, <4 x i8> addrspace(2)* %in
%ext = sext <4 x i8> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -663,7 +663,7 @@ define void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x
; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i64:
; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
%load = load <8 x i8>, <8 x i8> addrspace(2)* %in
%ext = zext <8 x i8> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -673,7 +673,7 @@ define void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x
; FUNC-LABEL: {{^}}constant_sextload_v8i8_to_v8i64:
; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
%load = load <8 x i8>, <8 x i8> addrspace(2)* %in
%ext = sext <8 x i8> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -683,7 +683,7 @@ define void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x
; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i64:
; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
%load = load <16 x i8>, <16 x i8> addrspace(2)* %in
%ext = zext <16 x i8> %load to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -693,7 +693,7 @@ define void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <1
; FUNC-LABEL: {{^}}constant_sextload_v16i8_to_v16i64:
; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
%load = load <16 x i8>, <16 x i8> addrspace(2)* %in
%ext = sext <16 x i8> %load to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -704,7 +704,7 @@ define void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
%load = load <32 x i8>, <32 x i8> addrspace(2)* %in
%ext = zext <32 x i8> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -715,7 +715,7 @@ define void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <3
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
%load = load <32 x i8>, <32 x i8> addrspace(2)* %in
%ext = sext <32 x i8> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -723,7 +723,7 @@ define void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <3
}
; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i64:
-; define void @constant_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
; %ext = zext <64 x i8> %load to <64 x i64>
; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -731,7 +731,7 @@ define void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <3
; }
; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i64:
-; define void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
; %ext = sext <64 x i8> %load to <64 x i64>
; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -744,7 +744,7 @@ define void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <3
; GCN-HSA: flat_load_ubyte v[[VAL:[0-9]+]],
; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
-define void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
%a = load i8, i8 addrspace(2)* %in
%ext = zext i8 %a to i16
store i16 %ext, i16 addrspace(1)* %out
@@ -759,7 +759,7 @@ define void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)
; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
%a = load i8, i8 addrspace(2)* %in
%ext = sext i8 %a to i16
store i16 %ext, i16 addrspace(1)* %out
@@ -767,7 +767,7 @@ define void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)
}
; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i16:
-define void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
%load = load <1 x i8>, <1 x i8> addrspace(2)* %in
%ext = zext <1 x i8> %load to <1 x i16>
store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
@@ -778,7 +778,7 @@ define void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x
; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
%load = load <1 x i8>, <1 x i8> addrspace(2)* %in
%ext = sext <1 x i8> %load to <1 x i16>
store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
@@ -788,7 +788,7 @@ define void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x
; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i16:
; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(2)* %in
%ext = zext <2 x i8> %load to <2 x i16>
store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
@@ -800,7 +800,7 @@ define void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x
; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(2)* %in
%ext = sext <2 x i8> %load to <2 x i16>
store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
@@ -810,7 +810,7 @@ define void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x
; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i16:
; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
%load = load <4 x i8>, <4 x i8> addrspace(2)* %in
%ext = zext <4 x i8> %load to <4 x i16>
store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
@@ -824,7 +824,7 @@ define void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
%load = load <4 x i8>, <4 x i8> addrspace(2)* %in
%ext = sext <4 x i8> %load to <4 x i16>
store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
@@ -834,7 +834,7 @@ define void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x
; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i16:
; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
%load = load <8 x i8>, <8 x i8> addrspace(2)* %in
%ext = zext <8 x i8> %load to <8 x i16>
store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
@@ -853,7 +853,7 @@ define void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
%load = load <8 x i8>, <8 x i8> addrspace(2)* %in
%ext = sext <8 x i8> %load to <8 x i16>
store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
@@ -863,7 +863,7 @@ define void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x
; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i16:
; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
%load = load <16 x i8>, <16 x i8> addrspace(2)* %in
%ext = zext <16 x i8> %load to <16 x i16>
store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
@@ -889,7 +889,7 @@ define void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <1
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
%load = load <16 x i8>, <16 x i8> addrspace(2)* %in
%ext = sext <16 x i8> %load to <16 x i16>
store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
@@ -900,7 +900,7 @@ define void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
%load = load <32 x i8>, <32 x i8> addrspace(2)* %in
%ext = zext <32 x i8> %load to <32 x i16>
store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
@@ -943,7 +943,7 @@ define void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <3
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
%load = load <32 x i8>, <32 x i8> addrspace(2)* %in
%ext = sext <32 x i8> %load to <32 x i16>
store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
@@ -951,7 +951,7 @@ define void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <3
}
; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i16:
-; define void @constant_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
; %ext = zext <64 x i8> %load to <64 x i16>
; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
@@ -959,7 +959,7 @@ define void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <3
; }
; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i16:
-; define void @constant_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
; %ext = sext <64 x i8> %load to <64 x i16>
; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-global-f32.ll b/test/CodeGen/AMDGPU/load-global-f32.ll
index 805c0a7a39c7..bd6fea587b42 100644
--- a/test/CodeGen/AMDGPU/load-global-f32.ll
+++ b/test/CodeGen/AMDGPU/load-global-f32.ll
@@ -10,7 +10,7 @@
; GCN-HSA: flat_load_dword
; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-define void @global_load_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
%tmp0 = load float, float addrspace(1)* %in
store float %tmp0, float addrspace(1)* %out
@@ -22,7 +22,7 @@ entry:
; GCN-HSA: flat_load_dwordx2
; R600: VTX_READ_64
-define void @global_load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
entry:
%tmp0 = load <2 x float>, <2 x float> addrspace(1)* %in
store <2 x float> %tmp0, <2 x float> addrspace(1)* %out
@@ -34,7 +34,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
; R600: VTX_READ_128
-define void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
entry:
%tmp0 = load <3 x float>, <3 x float> addrspace(1)* %in
store <3 x float> %tmp0, <3 x float> addrspace(1)* %out
@@ -46,7 +46,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
; R600: VTX_READ_128
-define void @global_load_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
entry:
%tmp0 = load <4 x float>, <4 x float> addrspace(1)* %in
store <4 x float> %tmp0, <4 x float> addrspace(1)* %out
@@ -61,7 +61,7 @@ entry:
; R600: VTX_READ_128
; R600: VTX_READ_128
-define void @global_load_v8f32(<8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8f32(<8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
entry:
%tmp0 = load <8 x float>, <8 x float> addrspace(1)* %in
store <8 x float> %tmp0, <8 x float> addrspace(1)* %out
@@ -83,7 +83,7 @@ entry:
; R600: VTX_READ_128
; R600: VTX_READ_128
; R600: VTX_READ_128
-define void @global_load_v16f32(<16 x float> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16f32(<16 x float> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
entry:
%tmp0 = load <16 x float>, <16 x float> addrspace(1)* %in
store <16 x float> %tmp0, <16 x float> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-global-f64.ll b/test/CodeGen/AMDGPU/load-global-f64.ll
index dc1a9432283e..5b772e1fe5ee 100644
--- a/test/CodeGen/AMDGPU/load-global-f64.ll
+++ b/test/CodeGen/AMDGPU/load-global-f64.ll
@@ -8,7 +8,7 @@
; GCN-HSA: flat_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
; GCN-HSA: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]]
-define void @global_load_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
%ld = load double, double addrspace(1)* %in
store double %ld, double addrspace(1)* %out
ret void
@@ -17,7 +17,7 @@ define void @global_load_f64(double addrspace(1)* %out, double addrspace(1)* %in
; FUNC-LABEL: {{^}}global_load_v2f64:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
-define void @global_load_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
entry:
%ld = load <2 x double>, <2 x double> addrspace(1)* %in
store <2 x double> %ld, <2 x double> addrspace(1)* %out
@@ -29,7 +29,7 @@ entry:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
-define void @global_load_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
entry:
%ld = load <3 x double>, <3 x double> addrspace(1)* %in
store <3 x double> %ld, <3 x double> addrspace(1)* %out
@@ -42,7 +42,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
-define void @global_load_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
entry:
%ld = load <4 x double>, <4 x double> addrspace(1)* %in
store <4 x double> %ld, <4 x double> addrspace(1)* %out
@@ -59,7 +59,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
-define void @global_load_v8f64(<8 x double> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8f64(<8 x double> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 {
entry:
%ld = load <8 x double>, <8 x double> addrspace(1)* %in
store <8 x double> %ld, <8 x double> addrspace(1)* %out
@@ -84,7 +84,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
-define void @global_load_v16f64(<16 x double> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16f64(<16 x double> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 {
entry:
%ld = load <16 x double>, <16 x double> addrspace(1)* %in
store <16 x double> %ld, <16 x double> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-global-i1.ll b/test/CodeGen/AMDGPU/load-global-i1.ll
index e2e90cac8cc1..cb3536a0c128 100644
--- a/test/CodeGen/AMDGPU/load-global-i1.ll
+++ b/test/CodeGen/AMDGPU/load-global-i1.ll
@@ -9,56 +9,56 @@
; EG: VTX_READ_8
; EG: AND_INT
-define void @global_load_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
%load = load i1, i1 addrspace(1)* %in
store i1 %load, i1 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v2i1:
-define void @global_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
%load = load <2 x i1>, <2 x i1> addrspace(1)* %in
store <2 x i1> %load, <2 x i1> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v3i1:
-define void @global_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
%load = load <3 x i1>, <3 x i1> addrspace(1)* %in
store <3 x i1> %load, <3 x i1> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v4i1:
-define void @global_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
%load = load <4 x i1>, <4 x i1> addrspace(1)* %in
store <4 x i1> %load, <4 x i1> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v8i1:
-define void @global_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
%load = load <8 x i1>, <8 x i1> addrspace(1)* %in
store <8 x i1> %load, <8 x i1> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v16i1:
-define void @global_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
%load = load <16 x i1>, <16 x i1> addrspace(1)* %in
store <16 x i1> %load, <16 x i1> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v32i1:
-define void @global_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
%load = load <32 x i1>, <32 x i1> addrspace(1)* %in
store <32 x i1> %load, <32 x i1> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v64i1:
-define void @global_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
%load = load <64 x i1>, <64 x i1> addrspace(1)* %in
store <64 x i1> %load, <64 x i1> addrspace(1)* %out
ret void
@@ -67,7 +67,7 @@ define void @global_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace
; FUNC-LABEL: {{^}}global_zextload_i1_to_i32:
; GCN: buffer_load_ubyte
; GCN: buffer_store_dword
-define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
%a = load i1, i1 addrspace(1)* %in
%ext = zext i1 %a to i32
store i32 %ext, i32 addrspace(1)* %out
@@ -81,7 +81,7 @@ define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
; EG: VTX_READ_8
; EG: BFE_INT
-define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
%a = load i1, i1 addrspace(1)* %in
%ext = sext i1 %a to i32
store i32 %ext, i32 addrspace(1)* %out
@@ -89,7 +89,7 @@ define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
}
; FUNC-LABEL: {{^}}global_zextload_v1i1_to_v1i32:
-define void @global_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
%load = load <1 x i1>, <1 x i1> addrspace(1)* %in
%ext = zext <1 x i1> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -97,7 +97,7 @@ define void @global_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1
}
; FUNC-LABEL: {{^}}global_sextload_v1i1_to_v1i32:
-define void @global_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
%load = load <1 x i1>, <1 x i1> addrspace(1)* %in
%ext = sext <1 x i1> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -105,7 +105,7 @@ define void @global_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1
}
; FUNC-LABEL: {{^}}global_zextload_v2i1_to_v2i32:
-define void @global_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
%load = load <2 x i1>, <2 x i1> addrspace(1)* %in
%ext = zext <2 x i1> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -113,7 +113,7 @@ define void @global_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1
}
; FUNC-LABEL: {{^}}global_sextload_v2i1_to_v2i32:
-define void @global_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
%load = load <2 x i1>, <2 x i1> addrspace(1)* %in
%ext = sext <2 x i1> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -121,7 +121,7 @@ define void @global_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1
}
; FUNC-LABEL: {{^}}global_zextload_v3i1_to_v3i32:
-define void @global_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
%load = load <3 x i1>, <3 x i1> addrspace(1)* %in
%ext = zext <3 x i1> %load to <3 x i32>
store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
@@ -129,7 +129,7 @@ define void @global_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1
}
; FUNC-LABEL: {{^}}global_sextload_v3i1_to_v3i32:
-define void @global_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
%load = load <3 x i1>, <3 x i1> addrspace(1)* %in
%ext = sext <3 x i1> %load to <3 x i32>
store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
@@ -137,7 +137,7 @@ define void @global_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1
}
; FUNC-LABEL: {{^}}global_zextload_v4i1_to_v4i32:
-define void @global_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
%load = load <4 x i1>, <4 x i1> addrspace(1)* %in
%ext = zext <4 x i1> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -145,7 +145,7 @@ define void @global_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1
}
; FUNC-LABEL: {{^}}global_sextload_v4i1_to_v4i32:
-define void @global_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
%load = load <4 x i1>, <4 x i1> addrspace(1)* %in
%ext = sext <4 x i1> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -153,7 +153,7 @@ define void @global_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1
}
; FUNC-LABEL: {{^}}global_zextload_v8i1_to_v8i32:
-define void @global_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
%load = load <8 x i1>, <8 x i1> addrspace(1)* %in
%ext = zext <8 x i1> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -161,7 +161,7 @@ define void @global_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1
}
; FUNC-LABEL: {{^}}global_sextload_v8i1_to_v8i32:
-define void @global_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
%load = load <8 x i1>, <8 x i1> addrspace(1)* %in
%ext = sext <8 x i1> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -169,7 +169,7 @@ define void @global_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1
}
; FUNC-LABEL: {{^}}global_zextload_v16i1_to_v16i32:
-define void @global_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
%load = load <16 x i1>, <16 x i1> addrspace(1)* %in
%ext = zext <16 x i1> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -177,7 +177,7 @@ define void @global_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16
}
; FUNC-LABEL: {{^}}global_sextload_v16i1_to_v16i32:
-define void @global_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
%load = load <16 x i1>, <16 x i1> addrspace(1)* %in
%ext = sext <16 x i1> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -185,7 +185,7 @@ define void @global_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16
}
; FUNC-LABEL: {{^}}global_zextload_v32i1_to_v32i32:
-define void @global_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
%load = load <32 x i1>, <32 x i1> addrspace(1)* %in
%ext = zext <32 x i1> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -193,7 +193,7 @@ define void @global_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32
}
; FUNC-LABEL: {{^}}global_sextload_v32i1_to_v32i32:
-define void @global_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
%load = load <32 x i1>, <32 x i1> addrspace(1)* %in
%ext = sext <32 x i1> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -201,7 +201,7 @@ define void @global_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32
}
; FUNC-LABEL: {{^}}global_zextload_v64i1_to_v64i32:
-define void @global_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
%load = load <64 x i1>, <64 x i1> addrspace(1)* %in
%ext = zext <64 x i1> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -209,7 +209,7 @@ define void @global_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64
}
; FUNC-LABEL: {{^}}global_sextload_v64i1_to_v64i32:
-define void @global_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
%load = load <64 x i1>, <64 x i1> addrspace(1)* %in
%ext = sext <64 x i1> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -221,7 +221,7 @@ define void @global_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64
; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]]{{$}}
; GCN: buffer_store_dwordx2
-define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
%a = load i1, i1 addrspace(1)* %in
%ext = zext i1 %a to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -233,7 +233,7 @@ define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
; GCN: buffer_store_dwordx2
-define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
%a = load i1, i1 addrspace(1)* %in
%ext = sext i1 %a to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -241,7 +241,7 @@ define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
}
; FUNC-LABEL: {{^}}global_zextload_v1i1_to_v1i64:
-define void @global_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
%load = load <1 x i1>, <1 x i1> addrspace(1)* %in
%ext = zext <1 x i1> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -249,7 +249,7 @@ define void @global_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1
}
; FUNC-LABEL: {{^}}global_sextload_v1i1_to_v1i64:
-define void @global_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
%load = load <1 x i1>, <1 x i1> addrspace(1)* %in
%ext = sext <1 x i1> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -257,7 +257,7 @@ define void @global_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1
}
; FUNC-LABEL: {{^}}global_zextload_v2i1_to_v2i64:
-define void @global_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
%load = load <2 x i1>, <2 x i1> addrspace(1)* %in
%ext = zext <2 x i1> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -265,7 +265,7 @@ define void @global_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1
}
; FUNC-LABEL: {{^}}global_sextload_v2i1_to_v2i64:
-define void @global_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
%load = load <2 x i1>, <2 x i1> addrspace(1)* %in
%ext = sext <2 x i1> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -273,7 +273,7 @@ define void @global_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1
}
; FUNC-LABEL: {{^}}global_zextload_v3i1_to_v3i64:
-define void @global_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
%load = load <3 x i1>, <3 x i1> addrspace(1)* %in
%ext = zext <3 x i1> %load to <3 x i64>
store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
@@ -281,7 +281,7 @@ define void @global_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1
}
; FUNC-LABEL: {{^}}global_sextload_v3i1_to_v3i64:
-define void @global_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
%load = load <3 x i1>, <3 x i1> addrspace(1)* %in
%ext = sext <3 x i1> %load to <3 x i64>
store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
@@ -289,7 +289,7 @@ define void @global_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1
}
; FUNC-LABEL: {{^}}global_zextload_v4i1_to_v4i64:
-define void @global_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
%load = load <4 x i1>, <4 x i1> addrspace(1)* %in
%ext = zext <4 x i1> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -297,7 +297,7 @@ define void @global_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1
}
; FUNC-LABEL: {{^}}global_sextload_v4i1_to_v4i64:
-define void @global_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
%load = load <4 x i1>, <4 x i1> addrspace(1)* %in
%ext = sext <4 x i1> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -305,7 +305,7 @@ define void @global_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1
}
; FUNC-LABEL: {{^}}global_zextload_v8i1_to_v8i64:
-define void @global_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
%load = load <8 x i1>, <8 x i1> addrspace(1)* %in
%ext = zext <8 x i1> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -313,7 +313,7 @@ define void @global_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1
}
; FUNC-LABEL: {{^}}global_sextload_v8i1_to_v8i64:
-define void @global_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
%load = load <8 x i1>, <8 x i1> addrspace(1)* %in
%ext = sext <8 x i1> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -321,7 +321,7 @@ define void @global_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1
}
; FUNC-LABEL: {{^}}global_zextload_v16i1_to_v16i64:
-define void @global_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
%load = load <16 x i1>, <16 x i1> addrspace(1)* %in
%ext = zext <16 x i1> %load to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -329,7 +329,7 @@ define void @global_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16
}
; FUNC-LABEL: {{^}}global_sextload_v16i1_to_v16i64:
-define void @global_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
%load = load <16 x i1>, <16 x i1> addrspace(1)* %in
%ext = sext <16 x i1> %load to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -337,7 +337,7 @@ define void @global_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16
}
; FUNC-LABEL: {{^}}global_zextload_v32i1_to_v32i64:
-define void @global_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
%load = load <32 x i1>, <32 x i1> addrspace(1)* %in
%ext = zext <32 x i1> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -345,7 +345,7 @@ define void @global_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32
}
; FUNC-LABEL: {{^}}global_sextload_v32i1_to_v32i64:
-define void @global_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
%load = load <32 x i1>, <32 x i1> addrspace(1)* %in
%ext = sext <32 x i1> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -353,7 +353,7 @@ define void @global_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32
}
; FUNC-LABEL: {{^}}global_zextload_v64i1_to_v64i64:
-define void @global_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
%load = load <64 x i1>, <64 x i1> addrspace(1)* %in
%ext = zext <64 x i1> %load to <64 x i64>
store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -361,7 +361,7 @@ define void @global_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64
}
; FUNC-LABEL: {{^}}global_sextload_v64i1_to_v64i64:
-define void @global_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
%load = load <64 x i1>, <64 x i1> addrspace(1)* %in
%ext = sext <64 x i1> %load to <64 x i64>
store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-global-i16.ll b/test/CodeGen/AMDGPU/load-global-i16.ll
index 88d6b7b99d30..dcdd1a947cd4 100644
--- a/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -11,7 +11,7 @@
; GCN-HSA: flat_load_ushort
; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_load_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
entry:
%ld = load i16, i16 addrspace(1)* %in
store i16 %ld, i16 addrspace(1)* %out
@@ -23,7 +23,7 @@ entry:
; GCN-HSA: flat_load_dword v
; EGCM: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
entry:
%ld = load <2 x i16>, <2 x i16> addrspace(1)* %in
store <2 x i16> %ld, <2 x i16> addrspace(1)* %out
@@ -36,7 +36,7 @@ entry:
; EGCM-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EGCM-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1
-define void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
entry:
%ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
store <3 x i16> %ld, <3 x i16> addrspace(1)* %out
@@ -48,7 +48,7 @@ entry:
; GCN-HSA: flat_load_dwordx2
; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
entry:
%ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
store <4 x i16> %ld, <4 x i16> addrspace(1)* %out
@@ -60,7 +60,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) {
entry:
%ld = load <8 x i16>, <8 x i16> addrspace(1)* %in
store <8 x i16> %ld, <8 x i16> addrspace(1)* %out
@@ -76,7 +76,7 @@ entry:
; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) {
entry:
%ld = load <16 x i16>, <16 x i16> addrspace(1)* %in
store <16 x i16> %ld, <16 x i16> addrspace(1)* %out
@@ -91,7 +91,7 @@ entry:
; GCN-HSA: flat_store_dword
; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
%a = load i16, i16 addrspace(1)* %in
%ext = zext i16 %a to i32
store i32 %ext, i32 addrspace(1)* %out
@@ -108,7 +108,7 @@ define void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
; EGCM: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1
; EGCM: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; EGCM: 16
-define void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
%a = load i16, i16 addrspace(1)* %in
%ext = sext i16 %a to i32
store i32 %ext, i32 addrspace(1)* %out
@@ -120,7 +120,7 @@ define void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
; GCN-HSA: flat_load_ushort
; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
%load = load <1 x i16>, <1 x i16> addrspace(1)* %in
%ext = zext <1 x i16> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -134,7 +134,7 @@ define void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i
; EGCM: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1
; EGCM: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; EGCM: 16
-define void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
%load = load <1 x i16>, <1 x i16> addrspace(1)* %in
%ext = sext <1 x i16> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -148,7 +148,7 @@ define void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i
; EGCM: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
; EGCM: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal
; EGCM: 16
-define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
%load = load <2 x i16>, <2 x i16> addrspace(1)* %in
%ext = zext <2 x i16> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -168,7 +168,7 @@ define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i
; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV.[XYZW]}}, 0.0, literal
; EGCM-DAG: 16
; EGCM-DAG: 16
-define void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
%load = load <2 x i16>, <2 x i16> addrspace(1)* %in
%ext = sext <2 x i16> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -190,7 +190,7 @@ define void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i
; EGCM: 16
; EGCM: AND_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, literal
; EGCM: AND_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], literal
-define void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
entry:
%ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
%ext = zext <3 x i16> %ld to <3 x i32>
@@ -214,7 +214,7 @@ entry:
; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], 0.0, literal
; EGCM-DAG: 16
; EGCM-DAG: 16
-define void @global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
entry:
%ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
%ext = sext <3 x i16> %ld to <3 x i32>
@@ -237,7 +237,7 @@ entry:
; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].X, {{.*}}, literal
; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{.*}}, literal
; EGCM-DAG: 16
-define void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
%load = load <4 x i16>, <4 x i16> addrspace(1)* %in
%ext = zext <4 x i16> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -262,7 +262,7 @@ define void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i
; EGCM-DAG: 16
; EGCM-DAG: 16
; EGCM-DAG: 16
-define void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
%load = load <4 x i16>, <4 x i16> addrspace(1)* %in
%ext = sext <4 x i16> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -296,7 +296,7 @@ define void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i
; EGCM-DAG: 16
; EGCM-DAG: 16
; EGCM-DAG: 16
-define void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
%load = load <8 x i16>, <8 x i16> addrspace(1)* %in
%ext = zext <8 x i16> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -330,7 +330,7 @@ define void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i
; EGCM-DAG: 16
; EGCM-DAG: 16
; EGCM-DAG: 16
-define void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
%load = load <8 x i16>, <8 x i16> addrspace(1)* %in
%ext = sext <8 x i16> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -346,7 +346,7 @@ define void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
-define void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
%load = load <16 x i16>, <16 x i16> addrspace(1)* %in
%ext = zext <16 x i16> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -357,7 +357,7 @@ define void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
-define void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
%load = load <16 x i16>, <16 x i16> addrspace(1)* %in
%ext = sext <16 x i16> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -379,7 +379,7 @@ define void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1
-define void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
%load = load <32 x i16>, <32 x i16> addrspace(1)* %in
%ext = zext <32 x i16> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -401,7 +401,7 @@ define void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1
-define void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
%load = load <32 x i16>, <32 x i16> addrspace(1)* %in
%ext = sext <32 x i16> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -435,7 +435,7 @@ define void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1
-define void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
%load = load <64 x i16>, <64 x i16> addrspace(1)* %in
%ext = zext <64 x i16> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -452,7 +452,7 @@ define void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1
; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1
-define void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
%load = load <64 x i16>, <64 x i16> addrspace(1)* %in
%ext = sext <64 x i16> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -469,7 +469,7 @@ define void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64
; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EGCM: MOV {{.*}}, 0.0
-define void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
%a = load i16, i16 addrspace(1)* %in
%ext = zext i16 %a to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -495,7 +495,7 @@ define void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
; EGCM: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
; TODO: These could be expanded earlier using ASHR 15
; EGCM: 31
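; Why ASHR 15 would be equivalent (sketch using names from the IR below):
; once BFE_INT has sign-extended from bit 16, bits 15..31 of the result
; are all copies of the sign bit, so shifting right by 15 or by 31 yields
; the same all-sign-bits high dword:
;   %lo = sext i16 %a to i32   ; the BFE_INT ..., 0.0, 16 on EG
;   %hi = ashr i32 %lo, 31     ; the ASHR ... 31 matched above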
-define void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
%a = load i16, i16 addrspace(1)* %in
%ext = sext i16 %a to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -506,7 +506,7 @@ define void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EGCM: MOV {{.*}}, 0.0
-define void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
%load = load <1 x i16>, <1 x i16> addrspace(1)* %in
%ext = zext <1 x i16> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -519,7 +519,7 @@ define void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i
; EGCM: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
; TODO: These could be expanded earlier using ASHR 15
; EGCM: 31
-define void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
%load = load <1 x i16>, <1 x i16> addrspace(1)* %in
%ext = sext <1 x i16> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -527,7 +527,7 @@ define void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i
}
; FUNC-LABEL: {{^}}global_zextload_v2i16_to_v2i64:
-define void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
%load = load <2 x i16>, <2 x i16> addrspace(1)* %in
%ext = zext <2 x i16> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -537,7 +537,7 @@ define void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
; FUNC-LABEL: {{^}}global_sextload_v2i16_to_v2i64:
; EGCM: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
%load = load <2 x i16>, <2 x i16> addrspace(1)* %in
%ext = sext <2 x i16> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -547,7 +547,7 @@ define void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
; FUNC-LABEL: {{^}}global_zextload_v4i16_to_v4i64:
; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
%load = load <4 x i16>, <4 x i16> addrspace(1)* %in
%ext = zext <4 x i16> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -557,7 +557,7 @@ define void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
; FUNC-LABEL: {{^}}global_sextload_v4i16_to_v4i64:
; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
%load = load <4 x i16>, <4 x i16> addrspace(1)* %in
%ext = sext <4 x i16> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -567,7 +567,7 @@ define void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
; FUNC-LABEL: {{^}}global_zextload_v8i16_to_v8i64:
; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
%load = load <8 x i16>, <8 x i16> addrspace(1)* %in
%ext = zext <8 x i16> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -577,7 +577,7 @@ define void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
; FUNC-LABEL: {{^}}global_sextload_v8i16_to_v8i64:
; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
%load = load <8 x i16>, <8 x i16> addrspace(1)* %in
%ext = sext <8 x i16> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -588,7 +588,7 @@ define void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
%load = load <16 x i16>, <16 x i16> addrspace(1)* %in
%ext = zext <16 x i16> %load to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -599,7 +599,7 @@ define void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16
; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
%load = load <16 x i16>, <16 x i16> addrspace(1)* %in
%ext = sext <16 x i16> %load to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -612,7 +612,7 @@ define void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16
; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
-define void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
%load = load <32 x i16>, <32 x i16> addrspace(1)* %in
%ext = zext <32 x i16> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -625,7 +625,7 @@ define void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32
; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
-define void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
%load = load <32 x i16>, <32 x i16> addrspace(1)* %in
%ext = sext <32 x i16> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -633,7 +633,7 @@ define void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32
}
; XFUNC-LABEL: {{^}}global_zextload_v64i16_to_v64i64:
-; define void @global_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+; define amdgpu_kernel void @global_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
; %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
; %ext = zext <64 x i16> %load to <64 x i64>
; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -641,7 +641,7 @@ define void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32
; }
; XFUNC-LABEL: {{^}}global_sextload_v64i16_to_v64i64:
-; define void @global_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+; define amdgpu_kernel void @global_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
; %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
; %ext = sext <64 x i16> %load to <64 x i64>
; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-global-i32.ll b/test/CodeGen/AMDGPU/load-global-i32.ll
index e3335347a63f..5df32c1e3120 100644
--- a/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -9,7 +9,7 @@
; GCN-HSA: flat_load_dword
; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-define void @global_load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
entry:
%ld = load i32, i32 addrspace(1)* %in
store i32 %ld, i32 addrspace(1)* %out
@@ -21,7 +21,7 @@ entry:
; GCN-HSA: flat_load_dwordx2
; EG: VTX_READ_64
-define void @global_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
entry:
%ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
store <2 x i32> %ld, <2 x i32> addrspace(1)* %out
@@ -33,7 +33,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
; EG: VTX_READ_128
-define void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 {
entry:
%ld = load <3 x i32>, <3 x i32> addrspace(1)* %in
store <3 x i32> %ld, <3 x i32> addrspace(1)* %out
@@ -45,7 +45,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
; EG: VTX_READ_128
-define void @global_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
entry:
%ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
store <4 x i32> %ld, <4 x i32> addrspace(1)* %out
@@ -60,7 +60,7 @@ entry:
; EG: VTX_READ_128
; EG: VTX_READ_128
-define void @global_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
entry:
%ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
store <8 x i32> %ld, <8 x i32> addrspace(1)* %out
@@ -82,7 +82,7 @@ entry:
; EG: VTX_READ_128
; EG: VTX_READ_128
; EG: VTX_READ_128
-define void @global_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
entry:
%ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
store <16 x i32> %ld, <16 x i32> addrspace(1)* %out
@@ -98,7 +98,7 @@ entry:
; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]]
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
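; ([[LO]] and [[HI]] are bound by checks above this hunk; the assumption,
; consistent with a zext, is that [[LO]] holds the loaded dword and [[HI]]
; a zeroed register, i.e. something along the lines of:
;   v_mov_b32_e32 v[[HI:[0-9]+]], 0
; so the 64-bit store writes {value, 0}.)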
-define void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%ld = load i32, i32 addrspace(1)* %in
%ext = zext i32 %ld to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -117,7 +117,7 @@ define void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)
; EG: VTX_READ_32
; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.
; EG: 31
-define void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%ld = load i32, i32 addrspace(1)* %in
%ext = sext i32 %ld to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -130,7 +130,7 @@ define void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)
; GCN-HSA: flat_load_dword
; GCN-HSA: flat_store_dwordx2
-define void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
%ld = load <1 x i32>, <1 x i32> addrspace(1)* %in
%ext = zext <1 x i32> %ld to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -143,7 +143,7 @@ define void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i
; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
%ld = load <1 x i32>, <1 x i32> addrspace(1)* %in
%ext = sext <1 x i32> %ld to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -156,7 +156,7 @@ define void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i
; GCN-HSA: flat_load_dwordx2
; GCN-HSA: flat_store_dwordx4
-define void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
%ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
%ext = zext <2 x i32> %ld to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -172,7 +172,7 @@ define void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
; GCN-NOHSA-DAG: buffer_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
-define void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
%ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
%ext = sext <2 x i32> %ld to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -187,7 +187,7 @@ define void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
-define void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
%ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
%ext = zext <4 x i32> %ld to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -208,7 +208,7 @@ define void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
-define void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
%ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
%ext = sext <4 x i32> %ld to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -231,7 +231,7 @@ define void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
-define void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
%ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
%ext = zext <8 x i32> %ld to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -263,7 +263,7 @@ define void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
-define void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
%ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
%ext = sext <8 x i32> %ld to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -309,7 +309,7 @@ define void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
; GCN-DAG: v_ashrrev_i32
; GCN-NOHSA-DAG: buffer_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
-define void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
%ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
%ext = sext <16 x i32> %ld to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -344,7 +344,7 @@ define void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16
; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
-define void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
%ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
%ext = zext <16 x i32> %ld to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -444,7 +444,7 @@ define void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16
; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
-define void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
%ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
%ext = sext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -511,7 +511,7 @@ define void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
-define void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
%ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
%ext = zext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-global-i64.ll b/test/CodeGen/AMDGPU/load-global-i64.ll
index dd4ce2c10ebd..de16b6c8997e 100644
--- a/test/CodeGen/AMDGPU/load-global-i64.ll
+++ b/test/CodeGen/AMDGPU/load-global-i64.ll
@@ -13,7 +13,7 @@
; GCN-HSA: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]]
; EG: VTX_READ_64
-define void @global_load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
%ld = load i64, i64 addrspace(1)* %in
store i64 %ld, i64 addrspace(1)* %out
ret void
@@ -24,7 +24,7 @@ define void @global_load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
; GCN-HSA: flat_load_dwordx4
; EG: VTX_READ_128
-define void @global_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) #0 {
entry:
%ld = load <2 x i64>, <2 x i64> addrspace(1)* %in
store <2 x i64> %ld, <2 x i64> addrspace(1)* %out
@@ -40,7 +40,7 @@ entry:
; EG: VTX_READ_128
; EG: VTX_READ_128
-define void @global_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %in) #0 {
entry:
%ld = load <3 x i64>, <3 x i64> addrspace(1)* %in
store <3 x i64> %ld, <3 x i64> addrspace(1)* %out
@@ -56,7 +56,7 @@ entry:
; EG: VTX_READ_128
; EG: VTX_READ_128
-define void @global_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
entry:
%ld = load <4 x i64>, <4 x i64> addrspace(1)* %in
store <4 x i64> %ld, <4 x i64> addrspace(1)* %out
@@ -78,7 +78,7 @@ entry:
; EG: VTX_READ_128
; EG: VTX_READ_128
; EG: VTX_READ_128
-define void @global_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %in) #0 {
entry:
%ld = load <8 x i64>, <8 x i64> addrspace(1)* %in
store <8 x i64> %ld, <8 x i64> addrspace(1)* %out
@@ -112,7 +112,7 @@ entry:
; EG: VTX_READ_128
; EG: VTX_READ_128
; EG: VTX_READ_128
-define void @global_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %in) #0 {
entry:
%ld = load <16 x i64>, <16 x i64> addrspace(1)* %in
store <16 x i64> %ld, <16 x i64> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-global-i8.ll b/test/CodeGen/AMDGPU/load-global-i8.ll
index c880700f347b..71adf090532f 100644
--- a/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -11,7 +11,7 @@
; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; TODO: NOT AND
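; (That is, the round trip below needs no masking: the ubyte load
; zero-fills bits 8..31 and the byte store consumes only bits 0..7, so
; any AND with 255 in between is redundant; once it is gone, a FileCheck
; `-NOT` line on the AND could assert its absence.)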
-define void @global_load_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
entry:
%ld = load i8, i8 addrspace(1)* %in
store i8 %ld, i8 addrspace(1)* %out
@@ -23,7 +23,7 @@ entry:
; GCN-HSA: flat_load_ushort v
; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
entry:
%ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
@@ -35,7 +35,7 @@ entry:
; GCN-HSA: flat_load_dword v
; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
entry:
%ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
store <3 x i8> %ld, <3 x i8> addrspace(1)* %out
@@ -47,7 +47,7 @@ entry:
; GCN-HSA: flat_load_dword v
; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
entry:
%ld = load <4 x i8>, <4 x i8> addrspace(1)* %in
store <4 x i8> %ld, <4 x i8> addrspace(1)* %out
@@ -59,7 +59,7 @@ entry:
; GCN-HSA: flat_load_dwordx2
; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
entry:
%ld = load <8 x i8>, <8 x i8> addrspace(1)* %in
store <8 x i8> %ld, <8 x i8> addrspace(1)* %out
@@ -72,7 +72,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
entry:
%ld = load <16 x i8>, <16 x i8> addrspace(1)* %in
store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
@@ -84,7 +84,7 @@ entry:
; GCN-HSA: flat_load_ubyte
; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
%a = load i8, i8 addrspace(1)* %in
%ext = zext i8 %a to i32
store i32 %ext, i32 addrspace(1)* %out
@@ -98,7 +98,7 @@ define void @global_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)*
; EG: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; EG: 8
-define void @global_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
%ld = load i8, i8 addrspace(1)* %in
%ext = sext i8 %ld to i32
store i32 %ext, i32 addrspace(1)* %out
@@ -108,7 +108,7 @@ define void @global_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)*
; FUNC-LABEL: {{^}}global_zextload_v1i8_to_v1i32:
; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
%load = load <1 x i8>, <1 x i8> addrspace(1)* %in
%ext = zext <1 x i8> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -120,7 +120,7 @@ define void @global_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8
; EG: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; EG: 8
-define void @global_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
%load = load <1 x i8>, <1 x i8> addrspace(1)* %in
%ext = sext <1 x i8> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -135,7 +135,7 @@ define void @global_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8
; TODO: These should use DST, but for some of them there are redundant MOVs
; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
; EG-DAG: 8
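; (The intent is that BFE_UINT reads the register the VTX_READ defined,
; which sibling tests bind as [[DST]]; with stray copies in between, the
; operand is matched loosely. A tighter pattern, once the MOVs are gone,
; might look like: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal)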
-define void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(1)* %in
%ext = zext <2 x i8> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -152,7 +152,7 @@ define void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
; EG-DAG: 8
; EG-DAG: 8
-define void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(1)* %in
%ext = sext <2 x i8> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -174,7 +174,7 @@ define void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8
; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal
; EG-DAG: 8
; EG-DAG: 8
-define void @global_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
entry:
%ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
%ext = zext <3 x i8> %ld to <3 x i32>
@@ -207,7 +207,7 @@ entry:
; EG-DAG: 8
; EG-DAG: 8
; EG-DAG: 8
-define void @global_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
entry:
%ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
%ext = sext <3 x i8> %ld to <3 x i32>
@@ -227,7 +227,7 @@ entry:
; EG-DAG: 8
; EG-DAG: 8
; EG-DAG: 8
-define void @global_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in
%ext = zext <4 x i8> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -248,7 +248,7 @@ define void @global_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8
; EG-DAG: 8
; EG-DAG: 8
; EG-DAG: 8
-define void @global_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in
%ext = sext <4 x i8> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -273,7 +273,7 @@ define void @global_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8
; EG-DAG: 8
; EG-DAG: 8
; EG-DAG: 8
-define void @global_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
%load = load <8 x i8>, <8 x i8> addrspace(1)* %in
%ext = zext <8 x i8> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -300,7 +300,7 @@ define void @global_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8
; EG-DAG: 8
; EG-DAG: 8
; EG-DAG: 8
-define void @global_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
%load = load <8 x i8>, <8 x i8> addrspace(1)* %in
%ext = sext <8 x i8> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -341,7 +341,7 @@ define void @global_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8
; EG-DAG: 8
; EG-DAG: 8
; EG-DAG: 8
-define void @global_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
%load = load <16 x i8>, <16 x i8> addrspace(1)* %in
%ext = zext <16 x i8> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -384,7 +384,7 @@ define void @global_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16
; EG-DAG: 8
; EG-DAG: 8
; EG-DAG: 8
-define void @global_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
%load = load <16 x i8>, <16 x i8> addrspace(1)* %in
%ext = sext <16 x i8> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -456,7 +456,7 @@ define void @global_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16
; EG-DAG: 8
; EG-DAG: 8
; EG-DAG: 8
-define void @global_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
%load = load <32 x i8>, <32 x i8> addrspace(1)* %in
%ext = zext <32 x i8> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -532,7 +532,7 @@ define void @global_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32
; EG-DAG: 8
; EG-DAG: 8
; EG-DAG: 8
-define void @global_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
%load = load <32 x i8>, <32 x i8> addrspace(1)* %in
%ext = sext <32 x i8> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -545,7 +545,7 @@ define void @global_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1
-define void @global_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
%load = load <64 x i8>, <64 x i8> addrspace(1)* %in
%ext = zext <64 x i8> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -558,7 +558,7 @@ define void @global_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1
-define void @global_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
%load = load <64 x i8>, <64 x i8> addrspace(1)* %in
%ext = sext <64 x i8> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -576,7 +576,7 @@ define void @global_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64
; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: MOV {{.*}}, 0.0
-define void @global_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
%a = load i8, i8 addrspace(1)* %in
%ext = zext i8 %a to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -595,7 +595,7 @@ define void @global_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)*
; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
; TODO: Why not 7?
; EG: 31
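; (Re the TODO: after BFE_INT sign-extends from bit 8, bits 7..31 all
; equal the sign bit, so an ASHR by 7 would produce the same all-sign
; high dword as the ASHR by 31 matched above.)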
-define void @global_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
%a = load i8, i8 addrspace(1)* %in
%ext = sext i8 %a to i64
store i64 %ext, i64 addrspace(1)* %out
@@ -606,7 +606,7 @@ define void @global_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)*
; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: MOV {{.*}}, 0.0
-define void @global_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
%load = load <1 x i8>, <1 x i8> addrspace(1)* %in
%ext = zext <1 x i8> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -619,7 +619,7 @@ define void @global_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8
; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
; TODO: Why not 7?
; EG: 31
-define void @global_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
%load = load <1 x i8>, <1 x i8> addrspace(1)* %in
%ext = sext <1 x i8> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -629,7 +629,7 @@ define void @global_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8
; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i64:
; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(1)* %in
%ext = zext <2 x i8> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -639,7 +639,7 @@ define void @global_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8
; FUNC-LABEL: {{^}}global_sextload_v2i8_to_v2i64:
; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(1)* %in
%ext = sext <2 x i8> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -649,7 +649,7 @@ define void @global_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8
; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i64:
; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in
%ext = zext <4 x i8> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -659,7 +659,7 @@ define void @global_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8
; FUNC-LABEL: {{^}}global_sextload_v4i8_to_v4i64:
; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in
%ext = sext <4 x i8> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -669,7 +669,7 @@ define void @global_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8
; FUNC-LABEL: {{^}}global_zextload_v8i8_to_v8i64:
; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
%load = load <8 x i8>, <8 x i8> addrspace(1)* %in
%ext = zext <8 x i8> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -679,7 +679,7 @@ define void @global_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8
; FUNC-LABEL: {{^}}global_sextload_v8i8_to_v8i64:
; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @global_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
%load = load <8 x i8>, <8 x i8> addrspace(1)* %in
%ext = sext <8 x i8> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -689,7 +689,7 @@ define void @global_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8
; FUNC-LABEL: {{^}}global_zextload_v16i8_to_v16i64:
; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
%load = load <16 x i8>, <16 x i8> addrspace(1)* %in
%ext = zext <16 x i8> %load to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -699,7 +699,7 @@ define void @global_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16
; FUNC-LABEL: {{^}}global_sextload_v16i8_to_v16i64:
; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @global_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
%load = load <16 x i8>, <16 x i8> addrspace(1)* %in
%ext = sext <16 x i8> %load to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -710,7 +710,7 @@ define void @global_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @global_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
%load = load <32 x i8>, <32 x i8> addrspace(1)* %in
%ext = zext <32 x i8> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -721,7 +721,7 @@ define void @global_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
%load = load <32 x i8>, <32 x i8> addrspace(1)* %in
%ext = sext <32 x i8> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -729,7 +729,7 @@ define void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32
}
; XFUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i64:
-; define void @global_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+; define amdgpu_kernel void @global_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
; %ext = zext <64 x i8> %load to <64 x i64>
; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -737,7 +737,7 @@ define void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32
; }
; XFUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i64:
-; define void @global_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+; define amdgpu_kernel void @global_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
; %ext = sext <64 x i8> %load to <64 x i64>
; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -752,7 +752,7 @@ define void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32
; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
%a = load i8, i8 addrspace(1)* %in
%ext = zext i8 %a to i16
store i16 %ext, i16 addrspace(1)* %out
@@ -768,7 +768,7 @@ define void @global_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)*
; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @global_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
%a = load i8, i8 addrspace(1)* %in
%ext = sext i8 %a to i16
store i16 %ext, i16 addrspace(1)* %out
@@ -778,7 +778,7 @@ define void @global_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)*
; FUNC-LABEL: {{^}}global_zextload_v1i8_to_v1i16:
; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
%load = load <1 x i8>, <1 x i8> addrspace(1)* %in
%ext = zext <1 x i8> %load to <1 x i16>
store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
@@ -789,7 +789,7 @@ define void @global_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8
; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @global_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
%load = load <1 x i8>, <1 x i8> addrspace(1)* %in
%ext = sext <1 x i8> %load to <1 x i16>
store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
@@ -799,7 +799,7 @@ define void @global_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8
; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i16:
; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(1)* %in
%ext = zext <2 x i8> %load to <2 x i16>
store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
@@ -811,7 +811,7 @@ define void @global_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8
; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @global_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(1)* %in
%ext = sext <2 x i8> %load to <2 x i16>
store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
@@ -821,7 +821,7 @@ define void @global_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8
; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i16:
; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in
%ext = zext <4 x i8> %load to <4 x i16>
store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
@@ -835,7 +835,7 @@ define void @global_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @global_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in
%ext = sext <4 x i8> %load to <4 x i16>
store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
@@ -845,7 +845,7 @@ define void @global_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8
; FUNC-LABEL: {{^}}global_zextload_v8i8_to_v8i16:
; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
%load = load <8 x i8>, <8 x i8> addrspace(1)* %in
%ext = zext <8 x i8> %load to <8 x i16>
store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
@@ -863,7 +863,7 @@ define void @global_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @global_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
%load = load <8 x i8>, <8 x i8> addrspace(1)* %in
%ext = sext <8 x i8> %load to <8 x i16>
store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
@@ -873,7 +873,7 @@ define void @global_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8
; FUNC-LABEL: {{^}}global_zextload_v16i8_to_v16i16:
; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
%load = load <16 x i8>, <16 x i8> addrspace(1)* %in
%ext = zext <16 x i8> %load to <16 x i16>
store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
@@ -899,7 +899,7 @@ define void @global_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @global_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
%load = load <16 x i8>, <16 x i8> addrspace(1)* %in
%ext = sext <16 x i8> %load to <16 x i16>
store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
@@ -910,7 +910,7 @@ define void @global_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @global_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
%load = load <32 x i8>, <32 x i8> addrspace(1)* %in
%ext = zext <32 x i8> %load to <32 x i16>
store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
@@ -953,7 +953,7 @@ define void @global_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
%load = load <32 x i8>, <32 x i8> addrspace(1)* %in
%ext = sext <32 x i8> %load to <32 x i16>
store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
@@ -961,7 +961,7 @@ define void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32
}
; XFUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i16:
-; define void @global_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+; define amdgpu_kernel void @global_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
; %ext = zext <64 x i8> %load to <64 x i16>
; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
@@ -969,7 +969,7 @@ define void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32
; }
; XFUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i16:
-; define void @global_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+; define amdgpu_kernel void @global_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
; %ext = sext <64 x i8> %load to <64 x i16>
; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-input-fold.ll b/test/CodeGen/AMDGPU/load-input-fold.ll
index b1899a45bf56..0724e09d7ad0 100644
--- a/test/CodeGen/AMDGPU/load-input-fold.ll
+++ b/test/CodeGen/AMDGPU/load-input-fold.ll
@@ -97,15 +97,6 @@ main_body:
; Function Attrs: readnone
declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
-; Function Attrs: readonly
-declare float @fabs(float) #2
-
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.rsq(float) #1
-
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.clamp.f32(float, float, float) #1
-
; Function Attrs: nounwind readonly
declare float @llvm.pow.f32(float, float) #3
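; Note (an inference, not stated in the diff): the declarations removed above
; (@fabs, @llvm.AMDGPU.rsq, @llvm.AMDGPU.clamp.f32) are no longer referenced
; by this test; the legacy llvm.AMDGPU.* intrinsics were dropped from the
; tree, so the dead declares are deleted along with the unused @fabs.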
diff --git a/test/CodeGen/AMDGPU/load-local-f32.ll b/test/CodeGen/AMDGPU/load-local-f32.ll
index 77b5e3cf3aed..09d7145424de 100644
--- a/test/CodeGen/AMDGPU/load-local-f32.ll
+++ b/test/CodeGen/AMDGPU/load-local-f32.ll
@@ -7,7 +7,7 @@
; GCN: ds_read_b32
; EG: LDS_READ_RET
-define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) #0 {
+define amdgpu_kernel void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) #0 {
entry:
%tmp0 = load float, float addrspace(3)* %in
store float %tmp0, float addrspace(1)* %out
@@ -20,7 +20,7 @@ entry:
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) #0 {
entry:
%tmp0 = load <2 x float>, <2 x float> addrspace(3)* %in
store <2 x float> %tmp0, <2 x float> addrspace(1)* %out
@@ -38,7 +38,7 @@ entry:
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_load_v3f32(<3 x float> addrspace(3)* %out, <3 x float> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3f32(<3 x float> addrspace(3)* %out, <3 x float> addrspace(3)* %in) #0 {
entry:
%tmp0 = load <3 x float>, <3 x float> addrspace(3)* %in
store <3 x float> %tmp0, <3 x float> addrspace(3)* %out
@@ -52,7 +52,7 @@ entry:
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_load_v4f32(<4 x float> addrspace(3)* %out, <4 x float> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4f32(<4 x float> addrspace(3)* %out, <4 x float> addrspace(3)* %in) #0 {
entry:
%tmp0 = load <4 x float>, <4 x float> addrspace(3)* %in
store <4 x float> %tmp0, <4 x float> addrspace(3)* %out
@@ -71,7 +71,7 @@ entry:
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_load_v8f32(<8 x float> addrspace(3)* %out, <8 x float> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8f32(<8 x float> addrspace(3)* %out, <8 x float> addrspace(3)* %in) #0 {
entry:
%tmp0 = load <8 x float>, <8 x float> addrspace(3)* %in
store <8 x float> %tmp0, <8 x float> addrspace(3)* %out
@@ -100,7 +100,7 @@ entry:
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_load_v16f32(<16 x float> addrspace(3)* %out, <16 x float> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16f32(<16 x float> addrspace(3)* %out, <16 x float> addrspace(3)* %in) #0 {
entry:
%tmp0 = load <16 x float>, <16 x float> addrspace(3)* %in
store <16 x float> %tmp0, <16 x float> addrspace(3)* %out
diff --git a/test/CodeGen/AMDGPU/load-local-f64.ll b/test/CodeGen/AMDGPU/load-local-f64.ll
index 27d39b7e9d7d..9ad6c087bf2e 100644
--- a/test/CodeGen/AMDGPU/load-local-f64.ll
+++ b/test/CodeGen/AMDGPU/load-local-f64.ll
@@ -9,7 +9,7 @@
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_load_f64(double addrspace(3)* %out, double addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_f64(double addrspace(3)* %out, double addrspace(3)* %in) #0 {
%ld = load double, double addrspace(3)* %in
store double %ld, double addrspace(3)* %out
ret void
@@ -22,7 +22,7 @@ define void @local_load_f64(double addrspace(3)* %out, double addrspace(3)* %in)
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_load_v2f64(<2 x double> addrspace(3)* %out, <2 x double> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v2f64(<2 x double> addrspace(3)* %out, <2 x double> addrspace(3)* %in) #0 {
entry:
%ld = load <2 x double>, <2 x double> addrspace(3)* %in
store <2 x double> %ld, <2 x double> addrspace(3)* %out
@@ -39,7 +39,7 @@ entry:
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_load_v3f64(<3 x double> addrspace(3)* %out, <3 x double> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3f64(<3 x double> addrspace(3)* %out, <3 x double> addrspace(3)* %in) #0 {
entry:
%ld = load <3 x double>, <3 x double> addrspace(3)* %in
store <3 x double> %ld, <3 x double> addrspace(3)* %out
@@ -59,7 +59,7 @@ entry:
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_load_v4f64(<4 x double> addrspace(3)* %out, <4 x double> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4f64(<4 x double> addrspace(3)* %out, <4 x double> addrspace(3)* %in) #0 {
entry:
%ld = load <4 x double>, <4 x double> addrspace(3)* %in
store <4 x double> %ld, <4 x double> addrspace(3)* %out
@@ -88,7 +88,7 @@ entry:
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_load_v8f64(<8 x double> addrspace(3)* %out, <8 x double> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8f64(<8 x double> addrspace(3)* %out, <8 x double> addrspace(3)* %in) #0 {
entry:
%ld = load <8 x double>, <8 x double> addrspace(3)* %in
store <8 x double> %ld, <8 x double> addrspace(3)* %out
@@ -144,7 +144,7 @@ entry:
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_load_v16f64(<16 x double> addrspace(3)* %out, <16 x double> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16f64(<16 x double> addrspace(3)* %out, <16 x double> addrspace(3)* %in) #0 {
entry:
%ld = load <16 x double>, <16 x double> addrspace(3)* %in
store <16 x double> %ld, <16 x double> addrspace(3)* %out
diff --git a/test/CodeGen/AMDGPU/load-local-i1.ll b/test/CodeGen/AMDGPU/load-local-i1.ll
index 2eed9917b5e5..e8f134b1fb2e 100644
--- a/test/CodeGen/AMDGPU/load-local-i1.ll
+++ b/test/CodeGen/AMDGPU/load-local-i1.ll
@@ -10,56 +10,56 @@
; EG: LDS_UBYTE_READ_RET
; EG: AND_INT
; EG: LDS_BYTE_WRITE
-define void @local_load_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
%load = load i1, i1 addrspace(3)* %in
store i1 %load, i1 addrspace(3)* %out
ret void
}
; FUNC-LABEL: {{^}}local_load_v2i1:
-define void @local_load_v2i1(<2 x i1> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v2i1(<2 x i1> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
%load = load <2 x i1>, <2 x i1> addrspace(3)* %in
store <2 x i1> %load, <2 x i1> addrspace(3)* %out
ret void
}
; FUNC-LABEL: {{^}}local_load_v3i1:
-define void @local_load_v3i1(<3 x i1> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3i1(<3 x i1> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
%load = load <3 x i1>, <3 x i1> addrspace(3)* %in
store <3 x i1> %load, <3 x i1> addrspace(3)* %out
ret void
}
; FUNC-LABEL: {{^}}local_load_v4i1:
-define void @local_load_v4i1(<4 x i1> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4i1(<4 x i1> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
%load = load <4 x i1>, <4 x i1> addrspace(3)* %in
store <4 x i1> %load, <4 x i1> addrspace(3)* %out
ret void
}
; FUNC-LABEL: {{^}}local_load_v8i1:
-define void @local_load_v8i1(<8 x i1> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8i1(<8 x i1> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
%load = load <8 x i1>, <8 x i1> addrspace(3)* %in
store <8 x i1> %load, <8 x i1> addrspace(3)* %out
ret void
}
; FUNC-LABEL: {{^}}local_load_v16i1:
-define void @local_load_v16i1(<16 x i1> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16i1(<16 x i1> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
%load = load <16 x i1>, <16 x i1> addrspace(3)* %in
store <16 x i1> %load, <16 x i1> addrspace(3)* %out
ret void
}
; FUNC-LABEL: {{^}}local_load_v32i1:
-define void @local_load_v32i1(<32 x i1> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v32i1(<32 x i1> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
%load = load <32 x i1>, <32 x i1> addrspace(3)* %in
store <32 x i1> %load, <32 x i1> addrspace(3)* %out
ret void
}
; FUNC-LABEL: {{^}}local_load_v64i1:
-define void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
%load = load <64 x i1>, <64 x i1> addrspace(3)* %in
store <64 x i1> %load, <64 x i1> addrspace(3)* %out
ret void
@@ -68,7 +68,7 @@ define void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace(
; FUNC-LABEL: {{^}}local_zextload_i1_to_i32:
; GCN: ds_read_u8
; GCN: ds_write_b32
-define void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
%a = load i1, i1 addrspace(3)* %in
%ext = zext i1 %a to i32
store i32 %ext, i32 addrspace(3)* %out
@@ -82,7 +82,7 @@ define void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %
; EG: LDS_UBYTE_READ_RET
; EG: BFE_INT
-define void @local_sextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
%a = load i1, i1 addrspace(3)* %in
%ext = sext i1 %a to i32
store i32 %ext, i32 addrspace(3)* %out
@@ -90,7 +90,7 @@ define void @local_sextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %
}
; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i32:
-define void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
%load = load <1 x i1>, <1 x i1> addrspace(3)* %in
%ext = zext <1 x i1> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
@@ -98,7 +98,7 @@ define void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1>
}
; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i32:
-define void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
%load = load <1 x i1>, <1 x i1> addrspace(3)* %in
%ext = sext <1 x i1> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
@@ -106,7 +106,7 @@ define void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1>
}
; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i32:
-define void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
%load = load <2 x i1>, <2 x i1> addrspace(3)* %in
%ext = zext <2 x i1> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
@@ -114,7 +114,7 @@ define void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1>
}
; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i32:
-define void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
%load = load <2 x i1>, <2 x i1> addrspace(3)* %in
%ext = sext <2 x i1> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
@@ -122,7 +122,7 @@ define void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1>
}
; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i32:
-define void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
%load = load <3 x i1>, <3 x i1> addrspace(3)* %in
%ext = zext <3 x i1> %load to <3 x i32>
store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
@@ -130,7 +130,7 @@ define void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1>
}
; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i32:
-define void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
%load = load <3 x i1>, <3 x i1> addrspace(3)* %in
%ext = sext <3 x i1> %load to <3 x i32>
store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
@@ -138,7 +138,7 @@ define void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1>
}
; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i32:
-define void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
%load = load <4 x i1>, <4 x i1> addrspace(3)* %in
%ext = zext <4 x i1> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
@@ -146,7 +146,7 @@ define void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1>
}
; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i32:
-define void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
%load = load <4 x i1>, <4 x i1> addrspace(3)* %in
%ext = sext <4 x i1> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
@@ -154,7 +154,7 @@ define void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1>
}
; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i32:
-define void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
%load = load <8 x i1>, <8 x i1> addrspace(3)* %in
%ext = zext <8 x i1> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
@@ -162,7 +162,7 @@ define void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1>
}
; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i32:
-define void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
%load = load <8 x i1>, <8 x i1> addrspace(3)* %in
%ext = sext <8 x i1> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
@@ -170,7 +170,7 @@ define void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1>
}
; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i32:
-define void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
%load = load <16 x i1>, <16 x i1> addrspace(3)* %in
%ext = zext <16 x i1> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
@@ -178,7 +178,7 @@ define void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x
}
; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i32:
-define void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
%load = load <16 x i1>, <16 x i1> addrspace(3)* %in
%ext = sext <16 x i1> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
@@ -186,7 +186,7 @@ define void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x
}
; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i32:
-define void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
%load = load <32 x i1>, <32 x i1> addrspace(3)* %in
%ext = zext <32 x i1> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
@@ -194,7 +194,7 @@ define void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x
}
; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i32:
-define void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
%load = load <32 x i1>, <32 x i1> addrspace(3)* %in
%ext = sext <32 x i1> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
@@ -202,7 +202,7 @@ define void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x
}
; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i32:
-define void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
%load = load <64 x i1>, <64 x i1> addrspace(3)* %in
%ext = zext <64 x i1> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
@@ -210,7 +210,7 @@ define void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x
}
; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i32:
-define void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
%load = load <64 x i1>, <64 x i1> addrspace(3)* %in
%ext = sext <64 x i1> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
@@ -221,7 +221,7 @@ define void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x
; GCN-DAG: ds_read_u8 [[LOAD:v[0-9]+]],
; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
; GCN: ds_write_b64
-define void @local_zextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
%a = load i1, i1 addrspace(3)* %in
%ext = zext i1 %a to i64
store i64 %ext, i64 addrspace(3)* %out
@@ -233,7 +233,7 @@ define void @local_zextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %
; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
; GCN: ds_write_b64
-define void @local_sextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
%a = load i1, i1 addrspace(3)* %in
%ext = sext i1 %a to i64
store i64 %ext, i64 addrspace(3)* %out
@@ -241,7 +241,7 @@ define void @local_sextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %
}
; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i64:
-define void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
%load = load <1 x i1>, <1 x i1> addrspace(3)* %in
%ext = zext <1 x i1> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -249,7 +249,7 @@ define void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1>
}
; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i64:
-define void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
%load = load <1 x i1>, <1 x i1> addrspace(3)* %in
%ext = sext <1 x i1> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -257,7 +257,7 @@ define void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1>
}
; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i64:
-define void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
%load = load <2 x i1>, <2 x i1> addrspace(3)* %in
%ext = zext <2 x i1> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -265,7 +265,7 @@ define void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1>
}
; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i64:
-define void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
%load = load <2 x i1>, <2 x i1> addrspace(3)* %in
%ext = sext <2 x i1> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -273,7 +273,7 @@ define void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1>
}
; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i64:
-define void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
%load = load <3 x i1>, <3 x i1> addrspace(3)* %in
%ext = zext <3 x i1> %load to <3 x i64>
store <3 x i64> %ext, <3 x i64> addrspace(3)* %out
@@ -281,7 +281,7 @@ define void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1>
}
; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i64:
-define void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
%load = load <3 x i1>, <3 x i1> addrspace(3)* %in
%ext = sext <3 x i1> %load to <3 x i64>
store <3 x i64> %ext, <3 x i64> addrspace(3)* %out
@@ -289,7 +289,7 @@ define void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1>
}
; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i64:
-define void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
%load = load <4 x i1>, <4 x i1> addrspace(3)* %in
%ext = zext <4 x i1> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -297,7 +297,7 @@ define void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1>
}
; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i64:
-define void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
%load = load <4 x i1>, <4 x i1> addrspace(3)* %in
%ext = sext <4 x i1> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -305,7 +305,7 @@ define void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1>
}
; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i64:
-define void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
%load = load <8 x i1>, <8 x i1> addrspace(3)* %in
%ext = zext <8 x i1> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -313,7 +313,7 @@ define void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1>
}
; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i64:
-define void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
%load = load <8 x i1>, <8 x i1> addrspace(3)* %in
%ext = sext <8 x i1> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -321,7 +321,7 @@ define void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1>
}
; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i64:
-define void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
%load = load <16 x i1>, <16 x i1> addrspace(3)* %in
%ext = zext <16 x i1> %load to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -329,7 +329,7 @@ define void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x
}
; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i64:
-define void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
%load = load <16 x i1>, <16 x i1> addrspace(3)* %in
%ext = sext <16 x i1> %load to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -337,7 +337,7 @@ define void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x
}
; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i64:
-define void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
%load = load <32 x i1>, <32 x i1> addrspace(3)* %in
%ext = zext <32 x i1> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
@@ -345,7 +345,7 @@ define void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x
}
; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i64:
-define void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
%load = load <32 x i1>, <32 x i1> addrspace(3)* %in
%ext = sext <32 x i1> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
@@ -353,7 +353,7 @@ define void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x
}
; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i64:
-define void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
%load = load <64 x i1>, <64 x i1> addrspace(3)* %in
%ext = zext <64 x i1> %load to <64 x i64>
store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
@@ -361,7 +361,7 @@ define void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x
}
; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i64:
-define void @local_sextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
%load = load <64 x i1>, <64 x i1> addrspace(3)* %in
%ext = sext <64 x i1> %load to <64 x i64>
store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
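; Sketch (restating the GCN checks earlier in this file, not adding new ones):
; for i1 sources, the zero-extending path only needs the ds_read_u8 plus a
; zeroed high register, while the sign-extending path is expected to lower to
; a 1-bit signed bitfield extract with an arithmetic shift for the high
; 32 bits, roughly:
;   v_bfe_i32         v0, v0, 0, 1
;   v_ashrrev_i32_e32 v1, 31, v0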
diff --git a/test/CodeGen/AMDGPU/load-local-i16.ll b/test/CodeGen/AMDGPU/load-local-i16.ll
index d4e86de66aff..bbbb34e8d333 100644
--- a/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -10,7 +10,7 @@
; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
; EG: LDS_SHORT_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_load_i16(i16 addrspace(3)* %out, i16 addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_i16(i16 addrspace(3)* %out, i16 addrspace(3)* %in) {
entry:
%ld = load i16, i16 addrspace(3)* %in
store i16 %ld, i16 addrspace(3)* %out
@@ -25,7 +25,7 @@ entry:
; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_load_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) {
entry:
%ld = load <2 x i16>, <2 x i16> addrspace(3)* %in
store <2 x i16> %ld, <2 x i16> addrspace(3)* %out
@@ -39,7 +39,7 @@ entry:
; EG-DAG: LDS_USHORT_READ_RET
; EG-DAG: LDS_READ_RET
-define void @local_load_v3i16(<3 x i16> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v3i16(<3 x i16> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
entry:
%ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
store <3 x i16> %ld, <3 x i16> addrspace(3)* %out
@@ -51,7 +51,7 @@ entry:
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_load_v4i16(<4 x i16> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v4i16(<4 x i16> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) {
entry:
%ld = load <4 x i16>, <4 x i16> addrspace(3)* %in
store <4 x i16> %ld, <4 x i16> addrspace(3)* %out
@@ -65,7 +65,7 @@ entry:
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_load_v8i16(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v8i16(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) {
entry:
%ld = load <8 x i16>, <8 x i16> addrspace(3)* %in
store <8 x i16> %ld, <8 x i16> addrspace(3)* %out
@@ -86,7 +86,7 @@ entry:
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_load_v16i16(<16 x i16> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v16i16(<16 x i16> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) {
entry:
%ld = load <16 x i16>, <16 x i16> addrspace(3)* %in
store <16 x i16> %ld, <16 x i16> addrspace(3)* %out
@@ -102,7 +102,7 @@ entry:
; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
%a = load i16, i16 addrspace(3)* %in
%ext = zext i16 %a to i32
store i32 %ext, i32 addrspace(3)* %out
@@ -121,7 +121,7 @@ define void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)*
; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
; EG: 16
; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
%a = load i16, i16 addrspace(3)* %in
%ext = sext i16 %a to i32
store i32 %ext, i32 addrspace(3)* %out
@@ -136,7 +136,7 @@ define void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)*
; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
%load = load <1 x i16>, <1 x i16> addrspace(3)* %in
%ext = zext <1 x i16> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
@@ -153,7 +153,7 @@ define void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1
; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
; EG: 16
; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
%load = load <1 x i16>, <1 x i16> addrspace(3)* %in
%ext = sext <1 x i16> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
@@ -166,7 +166,7 @@ define void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1
; GCN: ds_read_b32
; EG: LDS_READ_RET
-define void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
%load = load <2 x i16>, <2 x i16> addrspace(3)* %in
%ext = zext <2 x i16> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
@@ -181,7 +181,7 @@ define void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1
; EG: LDS_READ_RET
; EG: BFE_INT
; EG: BFE_INT
-define void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
%load = load <2 x i16>, <2 x i16> addrspace(3)* %in
%ext = sext <2 x i16> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
@@ -194,7 +194,7 @@ define void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1
; GCN-DAG: ds_write_b64
; EG: LDS_READ_RET
-define void @local_local_zextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_local_zextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
entry:
%ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
%ext = zext <3 x i16> %ld to <3 x i32>
@@ -211,7 +211,7 @@ entry:
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
-define void @local_local_sextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_local_sextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
entry:
%ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
%ext = sext <3 x i16> %ld to <3 x i32>
@@ -226,7 +226,7 @@ entry:
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
%load = load <4 x i16>, <4 x i16> addrspace(3)* %in
%ext = zext <4 x i16> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
@@ -244,7 +244,7 @@ define void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
-define void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
%load = load <4 x i16>, <4 x i16> addrspace(3)* %in
%ext = sext <4 x i16> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
@@ -258,7 +258,7 @@ define void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
%load = load <8 x i16>, <8 x i16> addrspace(3)* %in
%ext = zext <8 x i16> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
@@ -280,7 +280,7 @@ define void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
-define void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
%load = load <8 x i16>, <8 x i16> addrspace(3)* %in
%ext = sext <8 x i16> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
@@ -304,7 +304,7 @@ define void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
%load = load <16 x i16>, <16 x i16> addrspace(3)* %in
%ext = zext <16 x i16> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
@@ -340,7 +340,7 @@ define void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
-define void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
%load = load <16 x i16>, <16 x i16> addrspace(3)* %in
%ext = sext <16 x i16> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
@@ -369,7 +369,7 @@ define void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
%load = load <32 x i16>, <32 x i16> addrspace(3)* %in
%ext = zext <32 x i16> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
@@ -406,7 +406,7 @@ define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
%load = load <32 x i16>, <32 x i16> addrspace(3)* %in
%ext = sext <32 x i16> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
@@ -471,7 +471,7 @@ define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
%load = load <64 x i16>, <64 x i16> addrspace(3)* %in
%ext = zext <64 x i16> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
@@ -512,7 +512,7 @@ define void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
%load = load <64 x i16>, <64 x i16> addrspace(3)* %in
%ext = sext <64 x i16> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
@@ -531,7 +531,7 @@ define void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64
; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
; EG-DAG: LDS_WRITE
; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
%a = load i16, i16 addrspace(3)* %in
%ext = zext i16 %a to i64
store i64 %ext, i64 addrspace(3)* %out
@@ -558,7 +558,7 @@ define void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)*
; EG-DAG: LDS_WRITE
; EG-DAG: 16
; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
%a = load i16, i16 addrspace(3)* %in
%ext = sext i16 %a to i64
store i64 %ext, i64 addrspace(3)* %out
@@ -573,7 +573,7 @@ define void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)*
; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
; EG-DAG: LDS_WRITE
; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
%load = load <1 x i16>, <1 x i16> addrspace(3)* %in
%ext = zext <1 x i16> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -590,7 +590,7 @@ define void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1
; EG-DAG: LDS_WRITE
; EG-DAG: 16
; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
%load = load <1 x i16>, <1 x i16> addrspace(3)* %in
%ext = sext <1 x i16> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -600,7 +600,7 @@ define void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1
; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i64:
; EG: LDS_READ_RET
-define void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
%load = load <2 x i16>, <2 x i16> addrspace(3)* %in
%ext = zext <2 x i16> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -612,7 +612,7 @@ define void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1
; EG: LDS_READ_RET
; EG-DAG: BFE_INT
; EG-DAG: ASHR
-define void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
%load = load <2 x i16>, <2 x i16> addrspace(3)* %in
%ext = sext <2 x i16> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -623,7 +623,7 @@ define void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
%load = load <4 x i16>, <4 x i16> addrspace(3)* %in
%ext = zext <4 x i16> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -638,7 +638,7 @@ define void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1
; EG-DAG: BFE_INT
; EG-DAG: ASHR
; EG-DAG: ASHR
-define void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
%load = load <4 x i16>, <4 x i16> addrspace(3)* %in
%ext = sext <4 x i16> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -651,7 +651,7 @@ define void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
%load = load <8 x i16>, <8 x i16> addrspace(3)* %in
%ext = zext <8 x i16> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -672,7 +672,7 @@ define void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1
; EG-DAG: BFE_INT
; EG-DAG: ASHR
; EG-DAG: ASHR
-define void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
%load = load <8 x i16>, <8 x i16> addrspace(3)* %in
%ext = sext <8 x i16> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -689,7 +689,7 @@ define void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
%load = load <16 x i16>, <16 x i16> addrspace(3)* %in
%ext = zext <16 x i16> %load to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -722,7 +722,7 @@ define void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16
; EG-DAG: BFE_INT
; EG-DAG: ASHR
; EG-DAG: ASHR
-define void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
%load = load <16 x i16>, <16 x i16> addrspace(3)* %in
%ext = sext <16 x i16> %load to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -747,7 +747,7 @@ define void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
%load = load <32 x i16>, <32 x i16> addrspace(3)* %in
%ext = zext <32 x i16> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
@@ -804,7 +804,7 @@ define void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32
; EG-DAG: BFE_INT
; EG-DAG: ASHR
; EG-DAG: ASHR
-define void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
%load = load <32 x i16>, <32 x i16> addrspace(3)* %in
%ext = sext <32 x i16> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
@@ -812,7 +812,7 @@ define void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32
}
; ; XFUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i64:
-; define void @local_zextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
+; define amdgpu_kernel void @local_zextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
; %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
; %ext = zext <64 x i16> %load to <64 x i64>
; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
@@ -820,7 +820,7 @@ define void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32
; }
; ; XFUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i64:
-; define void @local_sextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
+; define amdgpu_kernel void @local_sextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
; %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
; %ext = sext <64 x i16> %load to <64 x i64>
; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
diff --git a/test/CodeGen/AMDGPU/load-local-i32.ll b/test/CodeGen/AMDGPU/load-local-i32.ll
index 280f9658ef8d..86055413d2cf 100644
--- a/test/CodeGen/AMDGPU/load-local-i32.ll
+++ b/test/CodeGen/AMDGPU/load-local-i32.ll
@@ -9,7 +9,7 @@
; GCN: ds_read_b32
; EG: LDS_READ_RET
-define void @local_load_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
entry:
%ld = load i32, i32 addrspace(3)* %in
store i32 %ld, i32 addrspace(3)* %out
@@ -18,7 +18,7 @@ entry:
; FUNC-LABEL: {{^}}local_load_v2i32:
; GCN: ds_read_b64
-define void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
entry:
%ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
store <2 x i32> %ld, <2 x i32> addrspace(3)* %out
@@ -28,7 +28,7 @@ entry:
; FUNC-LABEL: {{^}}local_load_v3i32:
; GCN-DAG: ds_read_b64
; GCN-DAG: ds_read_b32
-define void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 {
entry:
%ld = load <3 x i32>, <3 x i32> addrspace(3)* %in
store <3 x i32> %ld, <3 x i32> addrspace(3)* %out
@@ -38,7 +38,7 @@ entry:
; FUNC-LABEL: {{^}}local_load_v4i32:
; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-define void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
entry:
%ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
store <4 x i32> %ld, <4 x i32> addrspace(3)* %out
@@ -48,7 +48,7 @@ entry:
; FUNC-LABEL: {{^}}local_load_v8i32:
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-define void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
entry:
%ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
store <8 x i32> %ld, <8 x i32> addrspace(3)* %out
@@ -64,7 +64,7 @@ entry:
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
-define void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
entry:
%ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
store <16 x i32> %ld, <16 x i32> addrspace(3)* %out
@@ -72,7 +72,7 @@ entry:
}
; FUNC-LABEL: {{^}}local_zextload_i32_to_i64:
-define void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
%ld = load i32, i32 addrspace(3)* %in
%ext = zext i32 %ld to i64
store i64 %ext, i64 addrspace(3)* %out
@@ -80,7 +80,7 @@ define void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)*
}
; FUNC-LABEL: {{^}}local_sextload_i32_to_i64:
-define void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
%ld = load i32, i32 addrspace(3)* %in
%ext = sext i32 %ld to i64
store i64 %ext, i64 addrspace(3)* %out
@@ -88,7 +88,7 @@ define void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)*
}
; FUNC-LABEL: {{^}}local_zextload_v1i32_to_v1i64:
-define void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
%ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
%ext = zext <1 x i32> %ld to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -96,7 +96,7 @@ define void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i3
}
; FUNC-LABEL: {{^}}local_sextload_v1i32_to_v1i64:
-define void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
%ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
%ext = sext <1 x i32> %ld to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -104,7 +104,7 @@ define void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i3
}
; FUNC-LABEL: {{^}}local_zextload_v2i32_to_v2i64:
-define void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
%ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
%ext = zext <2 x i32> %ld to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -112,7 +112,7 @@ define void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i3
}
; FUNC-LABEL: {{^}}local_sextload_v2i32_to_v2i64:
-define void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
%ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
%ext = sext <2 x i32> %ld to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -120,7 +120,7 @@ define void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i3
}
; FUNC-LABEL: {{^}}local_zextload_v4i32_to_v4i64:
-define void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
%ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
%ext = zext <4 x i32> %ld to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -128,7 +128,7 @@ define void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i3
}
; FUNC-LABEL: {{^}}local_sextload_v4i32_to_v4i64:
-define void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
%ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
%ext = sext <4 x i32> %ld to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -136,7 +136,7 @@ define void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i3
}
; FUNC-LABEL: {{^}}local_zextload_v8i32_to_v8i64:
-define void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
%ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
%ext = zext <8 x i32> %ld to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -144,7 +144,7 @@ define void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i3
}
; FUNC-LABEL: {{^}}local_sextload_v8i32_to_v8i64:
-define void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
%ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
%ext = sext <8 x i32> %ld to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -152,7 +152,7 @@ define void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i3
}
; FUNC-LABEL: {{^}}local_sextload_v16i32_to_v16i64:
-define void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
%ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
%ext = sext <16 x i32> %ld to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -160,7 +160,7 @@ define void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16
}
; FUNC-LABEL: {{^}}local_zextload_v16i32_to_v16i64:
-define void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
%ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
%ext = zext <16 x i32> %ld to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -168,7 +168,7 @@ define void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16
}
; FUNC-LABEL: {{^}}local_sextload_v32i32_to_v32i64:
-define void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
%ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
%ext = sext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
@@ -176,7 +176,7 @@ define void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32
}
; FUNC-LABEL: {{^}}local_zextload_v32i32_to_v32i64:
-define void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
%ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
%ext = zext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
diff --git a/test/CodeGen/AMDGPU/load-local-i64.ll b/test/CodeGen/AMDGPU/load-local-i64.ll
index 180807df7b9a..0c719a9e0bf9 100644
--- a/test/CodeGen/AMDGPU/load-local-i64.ll
+++ b/test/CodeGen/AMDGPU/load-local-i64.ll
@@ -9,7 +9,7 @@
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_load_i64(i64 addrspace(3)* %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i64(i64 addrspace(3)* %out, i64 addrspace(3)* %in) #0 {
%ld = load i64, i64 addrspace(3)* %in
store i64 %ld, i64 addrspace(3)* %out
ret void
@@ -22,7 +22,7 @@ define void @local_load_i64(i64 addrspace(3)* %out, i64 addrspace(3)* %in) #0 {
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_load_v2i64(<2 x i64> addrspace(3)* %out, <2 x i64> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v2i64(<2 x i64> addrspace(3)* %out, <2 x i64> addrspace(3)* %in) #0 {
entry:
%ld = load <2 x i64>, <2 x i64> addrspace(3)* %in
store <2 x i64> %ld, <2 x i64> addrspace(3)* %out
@@ -39,7 +39,7 @@ entry:
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_load_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> addrspace(3)* %in) #0 {
entry:
%ld = load <3 x i64>, <3 x i64> addrspace(3)* %in
store <3 x i64> %ld, <3 x i64> addrspace(3)* %out
@@ -59,7 +59,7 @@ entry:
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_load_v4i64(<4 x i64> addrspace(3)* %out, <4 x i64> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4i64(<4 x i64> addrspace(3)* %out, <4 x i64> addrspace(3)* %in) #0 {
entry:
%ld = load <4 x i64>, <4 x i64> addrspace(3)* %in
store <4 x i64> %ld, <4 x i64> addrspace(3)* %out
@@ -88,7 +88,7 @@ entry:
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_load_v8i64(<8 x i64> addrspace(3)* %out, <8 x i64> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8i64(<8 x i64> addrspace(3)* %out, <8 x i64> addrspace(3)* %in) #0 {
entry:
%ld = load <8 x i64>, <8 x i64> addrspace(3)* %in
store <8 x i64> %ld, <8 x i64> addrspace(3)* %out
@@ -144,7 +144,7 @@ entry:
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_load_v16i64(<16 x i64> addrspace(3)* %out, <16 x i64> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16i64(<16 x i64> addrspace(3)* %out, <16 x i64> addrspace(3)* %in) #0 {
entry:
%ld = load <16 x i64>, <16 x i64> addrspace(3)* %in
store <16 x i64> %ld, <16 x i64> addrspace(3)* %out
diff --git a/test/CodeGen/AMDGPU/load-local-i8.ll b/test/CodeGen/AMDGPU/load-local-i8.ll
index 9ffc74213dd5..731996ec6c45 100644
--- a/test/CodeGen/AMDGPU/load-local-i8.ll
+++ b/test/CodeGen/AMDGPU/load-local-i8.ll
@@ -9,7 +9,7 @@
; GCN: ds_read_u8
; EG: LDS_UBYTE_READ_RET
-define void @local_load_i8(i8 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i8(i8 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
entry:
%ld = load i8, i8 addrspace(3)* %in
store i8 %ld, i8 addrspace(3)* %out
@@ -22,7 +22,7 @@ entry:
; GCN: ds_read_u16
; EG: LDS_USHORT_READ_RET
-define void @local_load_v2i8(<2 x i8> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v2i8(<2 x i8> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
entry:
%ld = load <2 x i8>, <2 x i8> addrspace(3)* %in
store <2 x i8> %ld, <2 x i8> addrspace(3)* %out
@@ -33,7 +33,7 @@ entry:
; GCN: ds_read_b32
; EG: LDS_READ_RET
-define void @local_load_v3i8(<3 x i8> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3i8(<3 x i8> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
entry:
%ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
store <3 x i8> %ld, <3 x i8> addrspace(3)* %out
@@ -44,7 +44,7 @@ entry:
; GCN: ds_read_b32
; EG: LDS_READ_RET
-define void @local_load_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
entry:
%ld = load <4 x i8>, <4 x i8> addrspace(3)* %in
store <4 x i8> %ld, <4 x i8> addrspace(3)* %out
@@ -56,7 +56,7 @@ entry:
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_load_v8i8(<8 x i8> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8i8(<8 x i8> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
entry:
%ld = load <8 x i8>, <8 x i8> addrspace(3)* %in
store <8 x i8> %ld, <8 x i8> addrspace(3)* %out
@@ -71,7 +71,7 @@ entry:
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_load_v16i8(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16i8(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
entry:
%ld = load <16 x i8>, <16 x i8> addrspace(3)* %in
store <16 x i8> %ld, <16 x i8> addrspace(3)* %out
@@ -84,7 +84,7 @@ entry:
; GCN: ds_read_u8
; EG: LDS_UBYTE_READ_RET
-define void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
%a = load i8, i8 addrspace(3)* %in
%ext = zext i8 %a to i32
store i32 %ext, i32 addrspace(3)* %out
@@ -98,7 +98,7 @@ define void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %
; EG: LDS_UBYTE_READ_RET
; EG: BFE_INT
-define void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
%ld = load i8, i8 addrspace(3)* %in
%ext = sext i8 %ld to i32
store i32 %ext, i32 addrspace(3)* %out
@@ -108,7 +108,7 @@ define void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %
; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i32:
; EG: LDS_UBYTE_READ_RET
-define void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
%load = load <1 x i8>, <1 x i8> addrspace(3)* %in
%ext = zext <1 x i8> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
@@ -119,7 +119,7 @@ define void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8>
; EG: LDS_UBYTE_READ_RET
; EG: BFE_INT
-define void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
%load = load <1 x i8>, <1 x i8> addrspace(3)* %in
%ext = sext <1 x i8> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
@@ -130,7 +130,7 @@ define void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8>
; GCN: ds_read_u16
; EG: LDS_USHORT_READ_RET
-define void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(3)* %in
%ext = zext <2 x i8> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
@@ -156,7 +156,7 @@ define void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8>
; EG: LDS_USHORT_READ_RET
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
-define void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(3)* %in
%ext = sext <2 x i8> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
@@ -172,7 +172,7 @@ define void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8>
; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff,
; EG: LDS_READ_RET
-define void @local_zextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
entry:
%ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
%ext = zext <3 x i8> %ld to <3 x i32>
@@ -197,7 +197,7 @@ entry:
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
-define void @local_sextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
entry:
%ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
%ext = sext <3 x i8> %ld to <3 x i32>
@@ -214,7 +214,7 @@ entry:
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
-define void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
%load = load <4 x i8>, <4 x i8> addrspace(3)* %in
%ext = zext <4 x i8> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
@@ -231,7 +231,7 @@ define void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8>
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
-define void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
%load = load <4 x i8>, <4 x i8> addrspace(3)* %in
%ext = sext <4 x i8> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
@@ -248,7 +248,7 @@ define void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8>
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
-define void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
%load = load <8 x i8>, <8 x i8> addrspace(3)* %in
%ext = zext <8 x i8> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
@@ -267,7 +267,7 @@ define void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8>
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
-define void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
%load = load <8 x i8>, <8 x i8> addrspace(3)* %in
%ext = sext <8 x i8> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
@@ -292,7 +292,7 @@ define void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8>
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
; EG-DAG: BFE_UINT
-define void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
%load = load <16 x i8>, <16 x i8> addrspace(3)* %in
%ext = zext <16 x i8> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
@@ -321,7 +321,7 @@ define void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
-define void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
%load = load <16 x i8>, <16 x i8> addrspace(3)* %in
%ext = sext <16 x i8> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
@@ -338,7 +338,7 @@ define void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
-define void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
%load = load <32 x i8>, <32 x i8> addrspace(3)* %in
%ext = zext <32 x i8> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
@@ -355,7 +355,7 @@ define void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
-define void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
%load = load <32 x i8>, <32 x i8> addrspace(3)* %in
%ext = sext <32 x i8> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
@@ -380,7 +380,7 @@ define void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
-define void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
%load = load <64 x i8>, <64 x i8> addrspace(3)* %in
%ext = zext <64 x i8> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
@@ -405,7 +405,7 @@ define void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
-define void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
%load = load <64 x i8>, <64 x i8> addrspace(3)* %in
%ext = sext <64 x i8> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
@@ -420,7 +420,7 @@ define void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x
; EG: LDS_UBYTE_READ_RET
; EG: MOV {{.*}}, literal
; EG: 0.0
-define void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
%a = load i8, i8 addrspace(3)* %in
%ext = zext i8 %a to i64
store i64 %ext, i64 addrspace(3)* %out
@@ -437,7 +437,7 @@ define void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %
; EG: ASHR
; TODO: why not 7?
; EG: 31
-define void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
%a = load i8, i8 addrspace(3)* %in
%ext = sext i8 %a to i64
store i64 %ext, i64 addrspace(3)* %out
@@ -450,7 +450,7 @@ define void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %
; EG: MOV {{.*}}, literal
; TODO: merge?
; EG: 0.0
-define void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
%load = load <1 x i8>, <1 x i8> addrspace(3)* %in
%ext = zext <1 x i8> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -463,7 +463,7 @@ define void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8>
; EG: ASHR
; TODO: why not 7?
; EG: 31
-define void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
%load = load <1 x i8>, <1 x i8> addrspace(3)* %in
%ext = sext <1 x i8> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -473,7 +473,7 @@ define void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8>
; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i64:
; EG: LDS_USHORT_READ_RET
-define void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(3)* %in
%ext = zext <2 x i8> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -485,7 +485,7 @@ define void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8>
; EG: LDS_USHORT_READ_RET
; EG: BFE_INT
; EG: BFE_INT
-define void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(3)* %in
%ext = sext <2 x i8> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -495,7 +495,7 @@ define void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8>
; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i64:
; EG: LDS_READ_RET
-define void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
%load = load <4 x i8>, <4 x i8> addrspace(3)* %in
%ext = zext <4 x i8> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -505,7 +505,7 @@ define void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8>
; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i64:
; EG: LDS_READ_RET
-define void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
%load = load <4 x i8>, <4 x i8> addrspace(3)* %in
%ext = sext <4 x i8> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -516,7 +516,7 @@ define void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8>
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
%load = load <8 x i8>, <8 x i8> addrspace(3)* %in
%ext = zext <8 x i8> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -536,7 +536,7 @@ define void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8>
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
-define void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
%load = load <8 x i8>, <8 x i8> addrspace(3)* %in
%ext = sext <8 x i8> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -549,7 +549,7 @@ define void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8>
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
%load = load <16 x i8>, <16 x i8> addrspace(3)* %in
%ext = zext <16 x i8> %load to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -562,7 +562,7 @@ define void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
%load = load <16 x i8>, <16 x i8> addrspace(3)* %in
%ext = sext <16 x i8> %load to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -579,7 +579,7 @@ define void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
%load = load <32 x i8>, <32 x i8> addrspace(3)* %in
%ext = zext <32 x i8> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
@@ -596,7 +596,7 @@ define void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-define void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
%load = load <32 x i8>, <32 x i8> addrspace(3)* %in
%ext = sext <32 x i8> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
@@ -604,7 +604,7 @@ define void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x
}
; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i64:
-; define void @local_zextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+; define amdgpu_kernel void @local_zextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
; %ext = zext <64 x i8> %load to <64 x i64>
; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
@@ -612,7 +612,7 @@ define void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x
; }
; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i64:
-; define void @local_sextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+; define amdgpu_kernel void @local_sextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
; %ext = sext <64 x i8> %load to <64 x i64>
; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
@@ -625,7 +625,7 @@ define void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x
; EG: LDS_UBYTE_READ_RET
; EG: LDS_SHORT_WRITE
-define void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
%a = load i8, i8 addrspace(3)* %in
%ext = zext i8 %a to i16
store i16 %ext, i16 addrspace(3)* %out
@@ -639,7 +639,7 @@ define void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %
; EG: LDS_UBYTE_READ_RET
; EG: BFE_INT
; EG: LDS_SHORT_WRITE
-define void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
%a = load i8, i8 addrspace(3)* %in
%ext = sext i8 %a to i16
store i16 %ext, i16 addrspace(3)* %out
@@ -650,7 +650,7 @@ define void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %
; EG: LDS_UBYTE_READ_RET
; EG: LDS_SHORT_WRITE
-define void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
%load = load <1 x i8>, <1 x i8> addrspace(3)* %in
%ext = zext <1 x i8> %load to <1 x i16>
store <1 x i16> %ext, <1 x i16> addrspace(3)* %out
@@ -662,7 +662,7 @@ define void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8>
; EG: LDS_UBYTE_READ_RET
; EG: BFE_INT
; EG: LDS_SHORT_WRITE
-define void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
%load = load <1 x i8>, <1 x i8> addrspace(3)* %in
%ext = sext <1 x i8> %load to <1 x i16>
store <1 x i16> %ext, <1 x i16> addrspace(3)* %out
@@ -673,7 +673,7 @@ define void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8>
; EG: LDS_USHORT_READ_RET
; EG: LDS_WRITE
-define void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(3)* %in
%ext = zext <2 x i8> %load to <2 x i16>
store <2 x i16> %ext, <2 x i16> addrspace(3)* %out
@@ -686,7 +686,7 @@ define void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8>
; EG: BFE_INT
; EG: BFE_INT
; EG: LDS_WRITE
-define void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
%load = load <2 x i8>, <2 x i8> addrspace(3)* %in
%ext = sext <2 x i8> %load to <2 x i16>
store <2 x i16> %ext, <2 x i16> addrspace(3)* %out
@@ -698,7 +698,7 @@ define void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8>
; EG: LDS_READ_RET
; EG: LDS_WRITE
; EG: LDS_WRITE
-define void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
%load = load <4 x i8>, <4 x i8> addrspace(3)* %in
%ext = zext <4 x i8> %load to <4 x i16>
store <4 x i16> %ext, <4 x i16> addrspace(3)* %out
@@ -715,7 +715,7 @@ define void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8>
; EG-DAG: BFE_INT
; EG: LDS_WRITE
; EG: LDS_WRITE
-define void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
%load = load <4 x i8>, <4 x i8> addrspace(3)* %in
%ext = sext <4 x i8> %load to <4 x i16>
store <4 x i16> %ext, <4 x i16> addrspace(3)* %out
@@ -730,7 +730,7 @@ define void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8>
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
-define void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
%load = load <8 x i8>, <8 x i8> addrspace(3)* %in
%ext = zext <8 x i8> %load to <8 x i16>
store <8 x i16> %ext, <8 x i16> addrspace(3)* %out
@@ -754,7 +754,7 @@ define void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8>
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
-define void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
%load = load <8 x i8>, <8 x i8> addrspace(3)* %in
%ext = sext <8 x i8> %load to <8 x i16>
store <8 x i16> %ext, <8 x i16> addrspace(3)* %out
@@ -775,7 +775,7 @@ define void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8>
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
-define void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
%load = load <16 x i8>, <16 x i8> addrspace(3)* %in
%ext = zext <16 x i8> %load to <16 x i16>
store <16 x i16> %ext, <16 x i16> addrspace(3)* %out
@@ -813,7 +813,7 @@ define void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
-define void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
%load = load <16 x i8>, <16 x i8> addrspace(3)* %in
%ext = sext <16 x i8> %load to <16 x i16>
store <16 x i16> %ext, <16 x i16> addrspace(3)* %out
@@ -846,7 +846,7 @@ define void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
-define void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
%load = load <32 x i8>, <32 x i8> addrspace(3)* %in
%ext = zext <32 x i8> %load to <32 x i16>
store <32 x i16> %ext, <32 x i16> addrspace(3)* %out
@@ -908,7 +908,7 @@ define void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
-define void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
%load = load <32 x i8>, <32 x i8> addrspace(3)* %in
%ext = sext <32 x i8> %load to <32 x i16>
store <32 x i16> %ext, <32 x i16> addrspace(3)* %out
@@ -916,7 +916,7 @@ define void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x
}
; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i16:
-; define void @local_zextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+; define amdgpu_kernel void @local_zextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
; %ext = zext <64 x i8> %load to <64 x i16>
; store <64 x i16> %ext, <64 x i16> addrspace(3)* %out
@@ -924,7 +924,7 @@ define void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x
; }
; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i16:
-; define void @local_sextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+; define amdgpu_kernel void @local_sextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
; %ext = sext <64 x i8> %load to <64 x i16>
; store <64 x i16> %ext, <64 x i16> addrspace(3)* %out
diff --git a/test/CodeGen/AMDGPU/load-weird-sizes.ll b/test/CodeGen/AMDGPU/load-weird-sizes.ll
index bc5e4945fb04..d6162c388b5b 100644
--- a/test/CodeGen/AMDGPU/load-weird-sizes.ll
+++ b/test/CodeGen/AMDGPU/load-weird-sizes.ll
@@ -8,7 +8,7 @@
; SI: {{flat|buffer}}_load_ubyte
; SI: {{flat|buffer}}_load_ushort
; SI: {{flat|buffer}}_store_dword
-define void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 {
%1 = load i24, i24 addrspace(1)* %in
%2 = zext i24 %1 to i32
store i32 %2, i32 addrspace(1)* %out
@@ -21,7 +21,7 @@ define void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 {
; CI-HSA: flat_load_dword [[VAL:v[0-9]+]]
; CI-HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VAL]]
-define void @load_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @load_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) #0 {
%1 = load i25, i25 addrspace(1)* %in
%2 = zext i25 %1 to i32
store i32 %2, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/local-64.ll b/test/CodeGen/AMDGPU/local-64.ll
index a7cee43187c1..bf4a93237bd4 100644
--- a/test/CodeGen/AMDGPU/local-64.ll
+++ b/test/CodeGen/AMDGPU/local-64.ll
@@ -5,7 +5,7 @@
; BOTH-LABEL: {{^}}local_i32_load:
; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28
; BOTH: buffer_store_dword [[REG]],
-define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
%val = load i32, i32 addrspace(3)* %gep, align 4
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -15,7 +15,7 @@ define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounw
; BOTH-LABEL: {{^}}local_i32_load_0_offset:
; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}}
; BOTH: buffer_store_dword [[REG]],
-define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
%val = load i32, i32 addrspace(3)* %in, align 4
store i32 %val, i32 addrspace(1)* %out, align 4
ret void
@@ -25,7 +25,7 @@ define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %
; BOTH-NOT: ADD
; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535
; BOTH: buffer_store_byte [[REG]],
-define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
%gep = getelementptr i8, i8 addrspace(3)* %in, i32 65535
%val = load i8, i8 addrspace(3)* %gep, align 4
store i8 %val, i8 addrspace(1)* %out, align 4
@@ -40,7 +40,7 @@ define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)
; BOTH: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]]
; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]]
; BOTH: buffer_store_byte [[REG]],
-define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
%gep = getelementptr i8, i8 addrspace(3)* %in, i32 65536
%val = load i8, i8 addrspace(3)* %gep, align 4
store i8 %val, i8 addrspace(1)* %out, align 4
@@ -51,7 +51,7 @@ define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspa
; BOTH-NOT: ADD
; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset:56
; BOTH: buffer_store_dwordx2 [[REG]],
-define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %in, i32 7
%val = load i64, i64 addrspace(3)* %gep, align 8
store i64 %val, i64 addrspace(1)* %out, align 8
@@ -61,7 +61,7 @@ define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounw
; BOTH-LABEL: {{^}}local_i64_load_0_offset:
; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
; BOTH: buffer_store_dwordx2 [[REG]],
-define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
%val = load i64, i64 addrspace(3)* %in, align 8
store i64 %val, i64 addrspace(1)* %out, align 8
ret void
@@ -71,7 +71,7 @@ define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %
; BOTH-NOT: ADD
; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset:56
; BOTH: buffer_store_dwordx2 [[REG]],
-define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
%gep = getelementptr double, double addrspace(3)* %in, i32 7
%val = load double, double addrspace(3)* %gep, align 8
store double %val, double addrspace(1)* %out, align 8
@@ -81,7 +81,7 @@ define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in)
; BOTH-LABEL: {{^}}local_f64_load_0_offset:
; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
; BOTH: buffer_store_dwordx2 [[REG]],
-define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
%val = load double, double addrspace(3)* %in, align 8
store double %val, double addrspace(1)* %out, align 8
ret void
@@ -90,7 +90,7 @@ define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace
; BOTH-LABEL: {{^}}local_i64_store:
; BOTH-NOT: ADD
; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
-define void @local_i64_store(i64 addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_i64_store(i64 addrspace(3)* %out) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %out, i32 7
store i64 5678, i64 addrspace(3)* %gep, align 8
ret void
@@ -99,7 +99,7 @@ define void @local_i64_store(i64 addrspace(3)* %out) nounwind {
; BOTH-LABEL: {{^}}local_i64_store_0_offset:
; BOTH-NOT: ADD
; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
-define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
store i64 1234, i64 addrspace(3)* %out, align 8
ret void
}
@@ -107,7 +107,7 @@ define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
; BOTH-LABEL: {{^}}local_f64_store:
; BOTH-NOT: ADD
; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
-define void @local_f64_store(double addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_f64_store(double addrspace(3)* %out) nounwind {
%gep = getelementptr double, double addrspace(3)* %out, i32 7
store double 16.0, double addrspace(3)* %gep, align 8
ret void
@@ -115,7 +115,7 @@ define void @local_f64_store(double addrspace(3)* %out) nounwind {
; BOTH-LABEL: {{^}}local_f64_store_0_offset:
; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
-define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
store double 20.0, double addrspace(3)* %out, align 8
ret void
}
@@ -124,7 +124,7 @@ define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
; BOTH-NOT: ADD
; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
; BOTH: s_endpgm
-define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
%gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7
store <2 x i64> <i64 5678, i64 5678>, <2 x i64> addrspace(3)* %gep, align 16
ret void
@@ -134,7 +134,7 @@ define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
; BOTH-NOT: ADD
; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
; BOTH: s_endpgm
-define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
ret void
}
@@ -144,7 +144,7 @@ define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
; BOTH: s_endpgm
-define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
%gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7
store <4 x i64> <i64 5678, i64 5678, i64 5678, i64 5678>, <4 x i64> addrspace(3)* %gep, align 16
ret void
@@ -155,7 +155,7 @@ define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
; BOTH: s_endpgm
-define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
ret void
}
diff --git a/test/CodeGen/AMDGPU/local-atomics.ll b/test/CodeGen/AMDGPU/local-atomics.ll
index 6714a28aa43a..de029d964b0d 100644
--- a/test/CodeGen/AMDGPU/local-atomics.ll
+++ b/test/CodeGen/AMDGPU/local-atomics.ll
@@ -11,7 +11,7 @@
; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
-define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
@@ -21,7 +21,7 @@ define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %
; EG: LDS_WRXCHG_RET *
; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -37,7 +37,7 @@ define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
-define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
@@ -47,7 +47,7 @@ define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
; EG: LDS_ADD_RET *
; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -59,7 +59,7 @@ define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
%sub = sub i32 %a, %b
%add = add i32 %sub, 4
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
@@ -73,7 +73,7 @@ define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 ad
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]]
; GCN: s_endpgm
-define void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
@@ -84,7 +84,7 @@ define void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16
; GCN: s_endpgm
-define void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -96,7 +96,7 @@ define void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_add1_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
%sub = sub i32 %a, %b
%add = add i32 %sub, 4
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
@@ -109,7 +109,7 @@ define void @lds_atomic_add1_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 a
; EG: LDS_SUB_RET *
; GCN: ds_sub_rtn_u32
; GCN: s_endpgm
-define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
@@ -119,7 +119,7 @@ define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
; EG: LDS_SUB_RET *
; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -131,7 +131,7 @@ define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]]
; GCN: s_endpgm
-define void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
@@ -142,7 +142,7 @@ define void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16
; GCN: s_endpgm
-define void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -153,7 +153,7 @@ define void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
; EG: LDS_AND_RET *
; GCN: ds_and_rtn_b32
; GCN: s_endpgm
-define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
@@ -163,7 +163,7 @@ define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
; EG: LDS_AND_RET *
; GCN: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -174,7 +174,7 @@ define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
; EG: LDS_OR_RET *
; GCN: ds_or_rtn_b32
; GCN: s_endpgm
-define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
@@ -184,7 +184,7 @@ define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %pt
; EG: LDS_OR_RET *
; GCN: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -195,7 +195,7 @@ define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(
; EG: LDS_XOR_RET *
; GCN: ds_xor_rtn_b32
; GCN: s_endpgm
-define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
@@ -205,7 +205,7 @@ define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
; EG: LDS_XOR_RET *
; GCN: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -214,7 +214,7 @@ define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i32:
-; define void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+; define amdgpu_kernel void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
; store i32 %result, i32 addrspace(1)* %out, align 4
; ret void
; }
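;
; A hedged sketch of the expansion (illustration only; the function name and
; exact block layout are hypothetical, and nothing below is checked by this
; test): with no ds_nand instruction, an AtomicExpand-style lowering would
; turn the operation into a cmpxchg loop computing ~(old & val), which is how
; the LangRef defines atomicrmw nand:
;
; define amdgpu_kernel void @lds_atomic_nand_expand_sketch(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
; entry:
;   %init = load i32, i32 addrspace(3)* %ptr
;   br label %loop
;
; loop:
;   ; Retry until nothing raced us between the load and the cmpxchg.
;   %old = phi i32 [ %init, %entry ], [ %loaded, %loop ]
;   %and = and i32 %old, 4
;   %new = xor i32 %and, -1                  ; %new = ~(%old & 4)
;   %pair = cmpxchg i32 addrspace(3)* %ptr, i32 %old, i32 %new seq_cst seq_cst
;   %loaded = extractvalue { i32, i1 } %pair, 0
;   %success = extractvalue { i32, i1 } %pair, 1
;   br i1 %success, label %done, label %loop
;
; done:
;   store i32 %loaded, i32 addrspace(1)* %out, align 4
;   ret void
; }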
@@ -224,7 +224,7 @@ define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
; EG: LDS_MIN_INT_RET *
; GCN: ds_min_rtn_i32
; GCN: s_endpgm
-define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
@@ -234,7 +234,7 @@ define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
; EG: LDS_MIN_INT_RET *
; GCN: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -245,7 +245,7 @@ define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
; EG: LDS_MAX_INT_RET *
; GCN: ds_max_rtn_i32
; GCN: s_endpgm
-define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
@@ -255,7 +255,7 @@ define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
; EG: LDS_MAX_INT_RET *
; GCN: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -266,7 +266,7 @@ define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
; EG: LDS_MIN_UINT_RET *
; GCN: ds_min_rtn_u32
; GCN: s_endpgm
-define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
@@ -276,7 +276,7 @@ define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %
; EG: LDS_MIN_UINT_RET *
; GCN: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -287,7 +287,7 @@ define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
; EG: LDS_MAX_UINT_RET *
; GCN: ds_max_rtn_u32
; GCN: s_endpgm
-define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
@@ -297,7 +297,7 @@ define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %
; EG: LDS_MAX_UINT_RET *
; GCN: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -310,7 +310,7 @@ define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
; GCN: s_endpgm
-define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
ret void
}
@@ -318,7 +318,7 @@ define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32_offset:
; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
ret void
@@ -330,7 +330,7 @@ define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; GCN: ds_add_u32 [[VPTR]], [[DATA]]
; GCN: s_endpgm
-define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst
ret void
}
@@ -338,7 +338,7 @@ define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_offset:
; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
ret void
@@ -348,7 +348,7 @@ define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}}
; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
%sub = sub i32 %a, %b
%add = add i32 %sub, 4
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
@@ -360,7 +360,7 @@ define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]]
; GCN: s_endpgm
-define void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
ret void
}
@@ -369,7 +369,7 @@ define void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]] offset:16
; GCN: s_endpgm
-define void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
ret void
@@ -379,7 +379,7 @@ define void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}}
; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_add1_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
%sub = sub i32 %a, %b
%add = add i32 %sub, 4
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
@@ -390,7 +390,7 @@ define void @lds_atomic_add1_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32
; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32:
; GCN: ds_sub_u32
; GCN: s_endpgm
-define void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst
ret void
}
@@ -398,7 +398,7 @@ define void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32_offset:
; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
ret void
@@ -408,7 +408,7 @@ define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]]
; GCN: s_endpgm
-define void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
ret void
}
@@ -417,7 +417,7 @@ define void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]] offset:16
; GCN: s_endpgm
-define void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
ret void
@@ -426,7 +426,7 @@ define void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32:
; GCN: ds_and_b32
; GCN: s_endpgm
-define void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
ret void
}
@@ -434,7 +434,7 @@ define void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32_offset:
; GCN: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
ret void
@@ -443,7 +443,7 @@ define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32:
; GCN: ds_or_b32
; GCN: s_endpgm
-define void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
ret void
}
@@ -451,7 +451,7 @@ define void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32_offset:
; GCN: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
ret void
@@ -460,7 +460,7 @@ define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32:
; GCN: ds_xor_b32
; GCN: s_endpgm
-define void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
ret void
}
@@ -468,7 +468,7 @@ define void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32_offset:
; GCN: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
ret void
@@ -476,7 +476,7 @@ define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i32:
-; define void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+; define amdgpu_kernel void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind {
; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
; ret void
; }
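; (For how the missing nand could be expanded, see the hedged cmpxchg-loop
; sketch after lds_atomic_nand_ret_i32 above; the no-return form would simply
; drop the final store.)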
@@ -484,7 +484,7 @@ define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32:
; GCN: ds_min_i32
; GCN: s_endpgm
-define void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
ret void
}
@@ -492,7 +492,7 @@ define void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32_offset:
; GCN: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
ret void
@@ -501,7 +501,7 @@ define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32:
; GCN: ds_max_i32
; GCN: s_endpgm
-define void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
ret void
}
@@ -509,7 +509,7 @@ define void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32_offset:
; GCN: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
ret void
@@ -518,7 +518,7 @@ define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32:
; GCN: ds_min_u32
; GCN: s_endpgm
-define void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
ret void
}
@@ -526,7 +526,7 @@ define void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32_offset:
; GCN: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
ret void
@@ -535,7 +535,7 @@ define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32:
; GCN: ds_max_u32
; GCN: s_endpgm
-define void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
ret void
}
@@ -543,7 +543,7 @@ define void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32_offset:
; GCN: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst
ret void
diff --git a/test/CodeGen/AMDGPU/local-atomics64.ll b/test/CodeGen/AMDGPU/local-atomics64.ll
index c88917812eda..6572a7bcd4fe 100644
--- a/test/CodeGen/AMDGPU/local-atomics64.ll
+++ b/test/CodeGen/AMDGPU/local-atomics64.ll
@@ -4,7 +4,7 @@
; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64:
; GCN: ds_wrxchg_rtn_b64
; GCN: s_endpgm
-define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
@@ -13,7 +13,7 @@ define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset:
; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
@@ -23,7 +23,7 @@ define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac
; GCN-LABEL: {{^}}lds_atomic_add_ret_i64:
; GCN: ds_add_rtn_u64
; GCN: s_endpgm
-define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
@@ -38,7 +38,7 @@ define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
; GCN: buffer_store_dwordx2 [[RESULT]],
; GCN: s_endpgm
-define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4
%result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
@@ -51,7 +51,7 @@ define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
; GCN: buffer_store_dwordx2 [[RESULT]],
; GCN: s_endpgm
-define void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
@@ -60,7 +60,7 @@ define void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
; GCN-LABEL: {{^}}lds_atomic_add1_ret_i64_offset:
; GCN: ds_add_rtn_u64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
@@ -70,7 +70,7 @@ define void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac
; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64:
; GCN: ds_sub_rtn_u64
; GCN: s_endpgm
-define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
@@ -79,7 +79,7 @@ define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64_offset:
; GCN: ds_sub_rtn_u64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
@@ -92,7 +92,7 @@ define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
; GCN: ds_sub_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
; GCN: buffer_store_dwordx2 [[RESULT]],
; GCN: s_endpgm
-define void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
@@ -101,7 +101,7 @@ define void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
; GCN-LABEL: {{^}}lds_atomic_sub1_ret_i64_offset:
; GCN: ds_sub_rtn_u64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
@@ -111,7 +111,7 @@ define void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac
; GCN-LABEL: {{^}}lds_atomic_and_ret_i64:
; GCN: ds_and_rtn_b64
; GCN: s_endpgm
-define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
@@ -120,7 +120,7 @@ define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
; GCN-LABEL: {{^}}lds_atomic_and_ret_i64_offset:
; GCN: ds_and_rtn_b64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
@@ -130,7 +130,7 @@ define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
; GCN-LABEL: {{^}}lds_atomic_or_ret_i64:
; GCN: ds_or_rtn_b64
; GCN: s_endpgm
-define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
@@ -139,7 +139,7 @@ define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %pt
; GCN-LABEL: {{^}}lds_atomic_or_ret_i64_offset:
; GCN: ds_or_rtn_b64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
@@ -149,7 +149,7 @@ define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(
; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64:
; GCN: ds_xor_rtn_b64
; GCN: s_endpgm
-define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
@@ -158,7 +158,7 @@ define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64_offset:
; GCN: ds_xor_rtn_b64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
@@ -167,7 +167,7 @@ define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
; XGCN-LABEL: {{^}}lds_atomic_nand_ret_i64:
-; define void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+; define amdgpu_kernel void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
; %result = atomicrmw nand i64 addrspace(3)* %ptr, i64 4 seq_cst
; store i64 %result, i64 addrspace(1)* %out, align 8
; ret void
; }
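; (Same situation as the i32 tests above: the missing nand would expand to
; the cmpxchg loop sketched there, just at 64 bits.)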
@@ -176,7 +176,7 @@ define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
; GCN-LABEL: {{^}}lds_atomic_min_ret_i64:
; GCN: ds_min_rtn_i64
; GCN: s_endpgm
-define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
@@ -185,7 +185,7 @@ define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
; GCN-LABEL: {{^}}lds_atomic_min_ret_i64_offset:
; GCN: ds_min_rtn_i64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
@@ -195,7 +195,7 @@ define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
; GCN-LABEL: {{^}}lds_atomic_max_ret_i64:
; GCN: ds_max_rtn_i64
; GCN: s_endpgm
-define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
@@ -204,7 +204,7 @@ define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
; GCN-LABEL: {{^}}lds_atomic_max_ret_i64_offset:
; GCN: ds_max_rtn_i64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
@@ -214,7 +214,7 @@ define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64:
; GCN: ds_min_rtn_u64
; GCN: s_endpgm
-define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
@@ -223,7 +223,7 @@ define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64_offset:
; GCN: ds_min_rtn_u64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
@@ -233,7 +233,7 @@ define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac
; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64:
; GCN: ds_max_rtn_u64
; GCN: s_endpgm
-define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
@@ -242,7 +242,7 @@ define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64_offset:
; GCN: ds_max_rtn_u64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
@@ -252,7 +252,7 @@ define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac
; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64:
; GCN: ds_wrxchg_rtn_b64
; GCN: s_endpgm
-define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst
ret void
}
@@ -260,7 +260,7 @@ define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset:
; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst
ret void
@@ -269,7 +269,7 @@ define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
; GCN-LABEL: {{^}}lds_atomic_add_noret_i64:
; GCN: ds_add_u64
; GCN: s_endpgm
-define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst
ret void
}
@@ -282,7 +282,7 @@ define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4
%result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst
ret void
@@ -293,7 +293,7 @@ define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
; GCN: s_endpgm
-define void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
ret void
}
@@ -301,7 +301,7 @@ define void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
; GCN-LABEL: {{^}}lds_atomic_add1_noret_i64_offset:
; GCN: ds_add_u64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
ret void
@@ -310,7 +310,7 @@ define void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64:
; GCN: ds_sub_u64
; GCN: s_endpgm
-define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst
ret void
}
@@ -318,7 +318,7 @@ define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind {
; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64_offset:
; GCN: ds_sub_u64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst
ret void
@@ -329,7 +329,7 @@ define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
; GCN: ds_sub_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
; GCN: s_endpgm
-define void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
ret void
}
@@ -337,7 +337,7 @@ define void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
; GCN-LABEL: {{^}}lds_atomic_sub1_noret_i64_offset:
; GCN: ds_sub_u64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
ret void
@@ -346,7 +346,7 @@ define void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
; GCN-LABEL: {{^}}lds_atomic_and_noret_i64:
; GCN: ds_and_b64
; GCN: s_endpgm
-define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
ret void
}
@@ -354,7 +354,7 @@ define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind {
; GCN-LABEL: {{^}}lds_atomic_and_noret_i64_offset:
; GCN: ds_and_b64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
ret void
@@ -363,7 +363,7 @@ define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
; GCN-LABEL: {{^}}lds_atomic_or_noret_i64:
; GCN: ds_or_b64
; GCN: s_endpgm
-define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
ret void
}
@@ -371,7 +371,7 @@ define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
; GCN-LABEL: {{^}}lds_atomic_or_noret_i64_offset:
; GCN: ds_or_b64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
ret void
@@ -380,7 +380,7 @@ define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64:
; GCN: ds_xor_b64
; GCN: s_endpgm
-define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
ret void
}
@@ -388,7 +388,7 @@ define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64_offset:
; GCN: ds_xor_b64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
ret void
@@ -396,7 +396,7 @@ define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
; XGCN-LABEL: {{^}}lds_atomic_nand_noret_i64:
-; define void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+; define amdgpu_kernel void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind {
; %result = atomicrmw nand i64 addrspace(3)* %ptr, i64 4 seq_cst
; ret void
; }
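; (As above, a 64-bit cmpxchg-loop expansion would cover the missing nand.)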
@@ -404,7 +404,7 @@ define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
; GCN-LABEL: {{^}}lds_atomic_min_noret_i64:
; GCN: ds_min_i64
; GCN: s_endpgm
-define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
ret void
}
@@ -412,7 +412,7 @@ define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
; GCN-LABEL: {{^}}lds_atomic_min_noret_i64_offset:
; GCN: ds_min_i64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
ret void
@@ -421,7 +421,7 @@ define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
; GCN-LABEL: {{^}}lds_atomic_max_noret_i64:
; GCN: ds_max_i64
; GCN: s_endpgm
-define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
ret void
}
@@ -429,7 +429,7 @@ define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
; GCN-LABEL: {{^}}lds_atomic_max_noret_i64_offset:
; GCN: ds_max_i64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
ret void
@@ -438,7 +438,7 @@ define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64:
; GCN: ds_min_u64
; GCN: s_endpgm
-define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
ret void
}
@@ -446,7 +446,7 @@ define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64_offset:
; GCN: ds_min_u64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
ret void
@@ -455,7 +455,7 @@ define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64:
; GCN: ds_max_u64
; GCN: s_endpgm
-define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
ret void
}
@@ -463,7 +463,7 @@ define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64_offset:
; GCN: ds_max_u64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
ret void
diff --git a/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
index a57e4f595322..47b6558241b9 100644
--- a/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ b/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -17,7 +17,7 @@
; GCN: s_barrier
; GCN: ds_read_b32 {{v[0-9]+}},
-define void @local_memory(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out) #0 {
entry:
%y.i = call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
@@ -45,11 +45,7 @@ entry:
; GCN-LABEL: {{^}}local_memory_two_objects:
; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
-
-; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]
-
-; SI-DAG: ds_write_b32 [[ADDRW]],
-; SI-DAG: ds_write_b32 [[ADDRW_OFF]],
+; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
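; (SI now matches CI here: the two 4-byte LDS stores are merged into a single
; ds_write2_b32 with offset1:4 rather than a manual v_add_i32 plus two
; separate ds_write_b32 instructions.)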
; GCN: s_barrier
@@ -61,7 +57,7 @@ entry:
; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
-define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
entry:
%x.i = call i32 @llvm.amdgcn.workitem.id.x()
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
diff --git a/test/CodeGen/AMDGPU/local-memory.ll b/test/CodeGen/AMDGPU/local-memory.ll
index 1a11332f865d..6124237d7638 100644
--- a/test/CodeGen/AMDGPU/local-memory.ll
+++ b/test/CodeGen/AMDGPU/local-memory.ll
@@ -14,7 +14,7 @@
; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4
; R600: LDS_READ_RET
-define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
entry:
%tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1
%tmp1 = load i32, i32 addrspace(3)* %tmp0
@@ -30,7 +30,7 @@ entry:
; R600: LDS_READ_RET
; GCN-DAG: ds_read_b32
; GCN-DAG: ds_read2_b32
-define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
%scalar = load i32, i32 addrspace(3)* %in
%tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)*
%vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2
diff --git a/test/CodeGen/AMDGPU/local-memory.r600.ll b/test/CodeGen/AMDGPU/local-memory.r600.ll
index 9841b8882b39..c8f4e4c986a7 100644
--- a/test/CodeGen/AMDGPU/local-memory.r600.ll
+++ b/test/CodeGen/AMDGPU/local-memory.r600.ll
@@ -15,7 +15,7 @@
; EG-NEXT: ALU clause
; EG: LDS_READ_RET
-define void @local_memory(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out) #0 {
entry:
%y.i = call i32 @llvm.r600.read.tidig.x() #1
%arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
@@ -57,7 +57,7 @@ entry:
; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
-define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
entry:
%x.i = call i32 @llvm.r600.read.tidig.x() #1
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
diff --git a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
index dc43e8613ddf..d3e0f0be4b5f 100644
--- a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
+++ b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
@@ -8,13 +8,12 @@
; CHECK-LABEL: {{^}}main:
; CHECK-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x200
-; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0x400{{$}}
; CHECK-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
; CHECK-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
-; TODO: add 0?
-; CHECK-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]], [[ZERO]]
-; CHECK-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[CLAMP_IDX]], [[K]]
+; CHECK-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]], [[K]]
+; CHECK-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[CLAMP_IDX]], [[ZERO]]
; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
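; (Both address halves now OR in a nonzero base, 0x200 and 0x400, which
; resolves the old TODO about the redundant "| 0"; note the [[ZERO]] binding
; keeps its name from the previous checks but now captures 0x400.)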
diff --git a/test/CodeGen/AMDGPU/loop-address.ll b/test/CodeGen/AMDGPU/loop-address.ll
index f60d574497de..e25d4f4b4f5f 100644
--- a/test/CodeGen/AMDGPU/loop-address.ll
+++ b/test/CodeGen/AMDGPU/loop-address.ll
@@ -5,7 +5,7 @@
;CHECK: LOOP_BREAK @10
;CHECK: POP @10
-define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 {
+define amdgpu_kernel void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 {
entry:
%cmp5 = icmp sgt i32 %iterations, 0
br i1 %cmp5, label %for.body, label %for.end
diff --git a/test/CodeGen/AMDGPU/loop-idiom.ll b/test/CodeGen/AMDGPU/loop-idiom.ll
index 5fd9806813cd..23ddd6488af9 100644
--- a/test/CodeGen/AMDGPU/loop-idiom.ll
+++ b/test/CodeGen/AMDGPU/loop-idiom.ll
@@ -9,7 +9,7 @@
; FUNC: @no_memcpy
; R600-NOT: {{^}}llvm.memcpy
; SI-NOT: {{^}}llvm.memcpy
-define void @no_memcpy(i8 addrspace(3)* %in, i32 %size) {
+define amdgpu_kernel void @no_memcpy(i8 addrspace(3)* %in, i32 %size) {
entry:
%dest = alloca i8, i32 32
br label %for.body
@@ -33,7 +33,7 @@ for.end:
; R600-NOT: {{^}}memset_pattern16:
; SI-NOT: {{^}}llvm.memset
; SI-NOT: {{^}}memset_pattern16:
-define void @no_memset(i32 %size) {
+define amdgpu_kernel void @no_memset(i32 %size) {
entry:
%dest = alloca i8, i32 32
br label %for.body
diff --git a/test/CodeGen/AMDGPU/loop_break.ll b/test/CodeGen/AMDGPU/loop_break.ll
index 82564b8bb28d..b9df2cb779ad 100644
--- a/test/CodeGen/AMDGPU/loop_break.ll
+++ b/test/CodeGen/AMDGPU/loop_break.ll
@@ -27,8 +27,9 @@
; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1
; GCN: s_or_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[INITMASK]]
-; GCN: s_cmp_gt_i32 s{{[0-9]+}}, -1
-; GCN-NEXT: s_cbranch_scc1 [[FLOW:BB[0-9]+_[0-9]+]]
+; GCN: v_cmp_lt_i32_e32 vcc, -1
+; GCN: s_and_b64 vcc, exec, vcc
+; GCN-NEXT: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
; GCN: ; BB#2: ; %bb4
; GCN: buffer_load_dword
@@ -43,7 +44,7 @@
; GCN: ; BB#4: ; %bb9
; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
; GCN-NEXT: s_endpgm
-define void @break_loop(i32 %arg) #0 {
+define amdgpu_kernel void @break_loop(i32 %arg) #0 {
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
%tmp = sub i32 %id, %arg
@@ -64,6 +65,264 @@ bb9:
ret void
}
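; The new tests below pin down the SI annotate-control-flow lowering of loop
; breaks. As a rough reading of the intrinsics (a sketch of the semantics,
; not normative): @llvm.amdgcn.if.break ORs the lanes whose break condition
; holds into the 64-bit mask carried by %phi.broken, @llvm.amdgcn.loop
; branches out once every active lane has accumulated a break, and
; @llvm.amdgcn.end.cf reenables the lanes that exited the loop early.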
+; OPT-LABEL: @undef_phi_cond_break_loop(
+; OPT: bb1:
+; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ]
+; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
+; OPT: %0 = call i64 @llvm.amdgcn.if.break(i1 undef, i64 %phi.broken)
+; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow
+
+; OPT: bb4:
+; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load
+; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken)
+; OPT-NEXT: br label %Flow
+
+; OPT: Flow:
+; OPT-NEXT: %loop.phi = phi i64 [ %1, %bb4 ], [ %0, %bb1 ]
+; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
+; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop(i64 %loop.phi)
+; OPT-NEXT: br i1 %2, label %bb9, label %bb1
+
+; OPT: bb9: ; preds = %Flow
+; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
+; OPT-NEXT: store volatile i32 7
+; OPT-NEXT: ret void
+define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 {
+bb:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp = sub i32 %id, %arg
+ br label %bb1
+
+bb1: ; preds = %Flow, %bb
+ %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
+ %lsr.iv.next = add i32 %lsr.iv, 1
+ %cmp0 = icmp slt i32 %lsr.iv.next, 0
+ br i1 %cmp0, label %bb4, label %Flow
+
+bb4: ; preds = %bb1
+ %load = load volatile i32, i32 addrspace(1)* undef, align 4
+ %cmp1 = icmp sge i32 %tmp, %load
+ br label %Flow
+
+Flow: ; preds = %bb4, %bb1
+ %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
+ %tmp3 = phi i1 [ %cmp1, %bb4 ], [ undef, %bb1 ]
+ br i1 %tmp3, label %bb9, label %bb1
+
+bb9: ; preds = %Flow
+ store volatile i32 7, i32 addrspace(3)* undef
+ ret void
+}
+
+; FIXME: ConstantExpr compare of address to null folds away
+@lds = addrspace(3) global i32 undef
+
+; OPT-LABEL: @constexpr_phi_cond_break_loop(
+; OPT: bb1:
+; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ]
+; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
+; OPT: %0 = call i64 @llvm.amdgcn.if.break(i1 icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds), i64 %phi.broken)
+; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow
+
+; OPT: bb4:
+; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load
+; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken)
+; OPT-NEXT: br label %Flow
+
+; OPT: Flow:
+; OPT-NEXT: %loop.phi = phi i64 [ %1, %bb4 ], [ %0, %bb1 ]
+; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
+; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop(i64 %loop.phi)
+; OPT-NEXT: br i1 %2, label %bb9, label %bb1
+
+; OPT: bb9: ; preds = %Flow
+; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
+; OPT-NEXT: store volatile i32 7
+; OPT-NEXT: ret void
+define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 {
+bb:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp = sub i32 %id, %arg
+ br label %bb1
+
+bb1: ; preds = %Flow, %bb
+ %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
+ %lsr.iv.next = add i32 %lsr.iv, 1
+ %cmp0 = icmp slt i32 %lsr.iv.next, 0
+ br i1 %cmp0, label %bb4, label %Flow
+
+bb4: ; preds = %bb1
+ %load = load volatile i32, i32 addrspace(1)* undef, align 4
+ %cmp1 = icmp sge i32 %tmp, %load
+ br label %Flow
+
+Flow: ; preds = %bb4, %bb1
+ %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
+ %tmp3 = phi i1 [ %cmp1, %bb4 ], [ icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds), %bb1 ]
+ br i1 %tmp3, label %bb9, label %bb1
+
+bb9: ; preds = %Flow
+ store volatile i32 7, i32 addrspace(3)* undef
+ ret void
+}
+
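+; When the break condition is the literal constant true, the pass can
+; use the unconditional @llvm.amdgcn.break form instead of
+; @llvm.amdgcn.if.break, as the first check below shows.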
+; OPT-LABEL: @true_phi_cond_break_loop(
+; OPT: bb1:
+; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ]
+; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
+; OPT: %0 = call i64 @llvm.amdgcn.break(i64 %phi.broken)
+; OPT: br i1 %cmp0, label %bb4, label %Flow
+
+; OPT: bb4:
+; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load
+; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken)
+; OPT-NEXT: br label %Flow
+
+; OPT: Flow:
+; OPT-NEXT: %loop.phi = phi i64 [ %1, %bb4 ], [ %0, %bb1 ]
+; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
+; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop(i64 %loop.phi)
+; OPT-NEXT: br i1 %2, label %bb9, label %bb1
+
+; OPT: bb9: ; preds = %Flow
+; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
+; OPT-NEXT: store volatile i32 7
+; OPT-NEXT: ret void
+define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 {
+bb:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp = sub i32 %id, %arg
+ br label %bb1
+
+bb1: ; preds = %Flow, %bb
+ %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
+ %lsr.iv.next = add i32 %lsr.iv, 1
+ %cmp0 = icmp slt i32 %lsr.iv.next, 0
+ br i1 %cmp0, label %bb4, label %Flow
+
+bb4: ; preds = %bb1
+ %load = load volatile i32, i32 addrspace(1)* undef, align 4
+ %cmp1 = icmp sge i32 %tmp, %load
+ br label %Flow
+
+Flow: ; preds = %bb4, %bb1
+ %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
+ %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ]
+ br i1 %tmp3, label %bb9, label %bb1
+
+bb9: ; preds = %Flow
+ store volatile i32 7, i32 addrspace(3)* undef
+ ret void
+}
+
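+; With a literal false break condition no @llvm.amdgcn.if.break is
+; needed in bb1 at all; the mask phi simply carries %phi.broken through
+; on that path.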
+; OPT-LABEL: @false_phi_cond_break_loop(
+; OPT: bb1:
+; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ]
+; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
+; OPT-NOT: call
+; OPT: br i1 %cmp0, label %bb4, label %Flow
+
+; OPT: bb4:
+; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load
+; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken)
+; OPT-NEXT: br label %Flow
+
+; OPT: Flow:
+; OPT-NEXT: %loop.phi = phi i64 [ %0, %bb4 ], [ %phi.broken, %bb1 ]
+; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
+; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop(i64 %loop.phi)
+; OPT-NEXT: br i1 %1, label %bb9, label %bb1
+
+; OPT: bb9: ; preds = %Flow
+; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
+; OPT-NEXT: store volatile i32 7
+; OPT-NEXT: ret void
+define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 {
+bb:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp = sub i32 %id, %arg
+ br label %bb1
+
+bb1: ; preds = %Flow, %bb
+ %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
+ %lsr.iv.next = add i32 %lsr.iv, 1
+ %cmp0 = icmp slt i32 %lsr.iv.next, 0
+ br i1 %cmp0, label %bb4, label %Flow
+
+bb4: ; preds = %bb1
+ %load = load volatile i32, i32 addrspace(1)* undef, align 4
+ %cmp1 = icmp sge i32 %tmp, %load
+ br label %Flow
+
+Flow: ; preds = %bb4, %bb1
+ %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
+ %tmp3 = phi i1 [ %cmp1, %bb4 ], [ false, %bb1 ]
+ br i1 %tmp3, label %bb9, label %bb1
+
+bb9: ; preds = %Flow
+ store volatile i32 7, i32 addrspace(3)* undef
+ ret void
+}
+
+; Swap the order of the branches in the flow block so that the true phi
+; value is the continue case.
+
+; OPT-LABEL: @invert_true_phi_cond_break_loop(
+; OPT: bb1:
+; OPT-NEXT: %phi.broken = phi i64 [ %1, %Flow ], [ 0, %bb ]
+; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
+; OPT-NEXT: %lsr.iv.next = add i32 %lsr.iv, 1
+; OPT-NEXT: %cmp0 = icmp slt i32 %lsr.iv.next, 0
+; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow
+
+; OPT: bb4:
+; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load
+; OPT-NEXT: br label %Flow
+
+; OPT: Flow:
+; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
+; OPT-NEXT: %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ]
+; OPT-NEXT: %0 = xor i1 %tmp3, true
+; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break(i1 %0, i64 %phi.broken)
+; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop(i64 %1)
+; OPT-NEXT: br i1 %2, label %bb9, label %bb1
+
+; OPT: bb9:
+; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %1)
+; OPT-NEXT: store volatile i32 7, i32 addrspace(3)* undef
+; OPT-NEXT: ret void
+define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 {
+bb:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp = sub i32 %id, %arg
+ br label %bb1
+
+bb1: ; preds = %Flow, %bb
+ %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
+ %lsr.iv.next = add i32 %lsr.iv, 1
+ %cmp0 = icmp slt i32 %lsr.iv.next, 0
+ br i1 %cmp0, label %bb4, label %Flow
+
+bb4: ; preds = %bb1
+ %load = load volatile i32, i32 addrspace(1)* undef, align 4
+ %cmp1 = icmp sge i32 %tmp, %load
+ br label %Flow
+
+Flow: ; preds = %bb4, %bb1
+ %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
+ %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ]
+ br i1 %tmp3, label %bb1, label %bb9
+
+bb9: ; preds = %Flow
+ store volatile i32 7, i32 addrspace(3)* undef
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
new file mode 100644
index 000000000000..74564f387ede
--- /dev/null
+++ b/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -0,0 +1,117 @@
+; RUN: opt -S -amdgpu-lower-intrinsics %s | FileCheck -check-prefix=OPT %s
+
+declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) #1
+declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #1
+
+declare void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) #1
+declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture, i8, i64, i32, i1) #1
+
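+; For orientation (a hand-written sketch, not FileCheck'd; names like
+; %len are illustrative): when a memcpy is too large to leave as a call,
+; the pass is expected to emit a simple byte-copy loop along these lines:
+;
+;   loop:
+;     %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
+;     %s = getelementptr inbounds i8, i8 addrspace(1)* %src, i64 %i
+;     %v = load i8, i8 addrspace(1)* %s
+;     %d = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %i
+;     store i8 %v, i8 addrspace(1)* %d
+;     %i.next = add i64 %i, 1
+;     %done = icmp eq i64 %i.next, %len
+;     br i1 %done, label %exit, label %loop
+;
+; The tests below only pin down the salient pieces: whether the call
+; stays or disappears, the gep/load/store pattern, and the phi.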
+; Test the largest static size that is still left as a call.
+; OPT-LABEL: @max_size_small_static_memcpy_caller0(
+; OPT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
+define amdgpu_kernel void @max_size_small_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
+ call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
+ ret void
+}
+
+; Smallest static size that will be expanded into loads and stores.
+; OPT-LABEL: @min_size_large_static_memcpy_caller0(
+; OPT-NOT: call
+; OPT: getelementptr
+; OPT-NEXT: load i8
+; OPT: getelementptr
+; OPT-NEXT: store i8
+define amdgpu_kernel void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
+ call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false)
+ ret void
+}
+
+; OPT-LABEL: @max_size_small_static_memmove_caller0(
+; OPT: call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
+define amdgpu_kernel void @max_size_small_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
+ call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
+ ret void
+}
+
+; OPT-LABEL: @min_size_large_static_memmove_caller0(
+; OPT-NOT: call
+; OPT: getelementptr
+; OPT-NEXT: load i8
+; OPT: getelementptr
+; OPT-NEXT: store i8
+define amdgpu_kernel void @min_size_large_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
+ call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false)
+ ret void
+}
+
+; OPT-LABEL: @max_size_small_static_memset_caller0(
+; OPT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1024, i32 1, i1 false)
+define amdgpu_kernel void @max_size_small_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 {
+ call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1024, i32 1, i1 false)
+ ret void
+}
+
+; OPT-LABEL: @min_size_large_static_memset_caller0(
+; OPT-NOT: call
+; OPT: getelementptr
+; OPT: store i8
+define amdgpu_kernel void @min_size_large_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 {
+ call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1025, i32 1, i1 false)
+ ret void
+}
+
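+; With a runtime length the call can never be left in place; the
+; expansion is always a loop, so these checks only look for the loop's
+; induction-variable phi rather than the exact IR.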
+; OPT-LABEL: @variable_memcpy_caller0(
+; OPT-NOT: call
+; OPT: phi
+define amdgpu_kernel void @variable_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
+ call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
+ ret void
+}
+
+; OPT-LABEL: @variable_memcpy_caller1(
+; OPT-NOT: call
+; OPT: phi
+define amdgpu_kernel void @variable_memcpy_caller1(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
+ call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
+ ret void
+}
+
+; OPT-LABEL: @memcpy_multi_use_one_function(
+; OPT-NOT: call
+; OPT: phi
+; OPT-NOT: call
+; OPT: phi
+; OPT-NOT: call
+define amdgpu_kernel void @memcpy_multi_use_one_function(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n, i64 %m) #0 {
+ call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
+ call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %m, i32 1, i1 false)
+ ret void
+}
+
+; OPT-LABEL: @memcpy_alt_type(
+; OPT: phi
+; OPT: getelementptr inbounds i8, i8 addrspace(3)*
+; OPT: load i8, i8 addrspace(3)*
+; OPT: getelementptr inbounds i8, i8 addrspace(1)*
+; OPT: store i8
+define amdgpu_kernel void @memcpy_alt_type(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n, i32 1, i1 false)
+ ret void
+}
+
+; One of the uses in the function should be expanded, the other left alone.
+; OPT-LABEL: @memcpy_multi_use_one_function_keep_small(
+; OPT: getelementptr inbounds i8, i8 addrspace(1)*
+; OPT: load i8, i8 addrspace(1)*
+; OPT: getelementptr inbounds i8, i8 addrspace(1)*
+; OPT: store i8
+
+; OPT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 102, i32 1, i1 false)
+define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n) #0 {
+ call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
+ call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 102, i32 1, i1 false)
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }
diff --git a/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll b/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
index e1fad13e0b51..6f5f4ca13b5e 100644
--- a/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
+++ b/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
@@ -5,7 +5,7 @@
; CHECK-LABEL: {{^}}test_workitem_id_x_known_max_range:
; CHECK-NOT: v0
; CHECK: {{flat|buffer}}_store_dword {{.*}}v0
-define void @test_workitem_id_x_known_max_range(i32 addrspace(1)* nocapture %out) #0 {
+define amdgpu_kernel void @test_workitem_id_x_known_max_range(i32 addrspace(1)* nocapture %out) #0 {
entry:
%id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
%and = and i32 %id, 1023
@@ -14,9 +14,9 @@ entry:
}
; CHECK-LABEL: {{^}}test_workitem_id_x_known_trunc_1_bit_range:
-; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1ff, v0
-; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]]
-define void @test_workitem_id_x_known_trunc_1_bit_range(i32 addrspace(1)* nocapture %out) #0 {
+; CHECK-NOT: v_and_b32
+; CHECK: {{flat|buffer}}_store_dword {{.*}}v0
+define amdgpu_kernel void @test_workitem_id_x_known_trunc_1_bit_range(i32 addrspace(1)* nocapture %out) #0 {
entry:
%id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
%and = and i32 %id, 511
@@ -26,9 +26,9 @@ entry:
; CHECK-LABEL: {{^}}test_workitem_id_x_known_max_range_m1:
; CHECK-NOT: v0
-; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xff, v0
-; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]]
-define void @test_workitem_id_x_known_max_range_m1(i32 addrspace(1)* nocapture %out) #0 {
+; CHECK-NOT: v_and_b32
+; CHECK: {{flat|buffer}}_store_dword {{.*}}v0
+define amdgpu_kernel void @test_workitem_id_x_known_max_range_m1(i32 addrspace(1)* nocapture %out) #0 {
entry:
%id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !1
%and = and i32 %id, 255
diff --git a/test/CodeGen/AMDGPU/lshl.ll b/test/CodeGen/AMDGPU/lshl.ll
deleted file mode 100644
index 8468437c2c1f..000000000000
--- a/test/CodeGen/AMDGPU/lshl.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s
-
-;CHECK: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1
-
-define void @test(i32 %p) {
- %i = mul i32 %p, 2
- %r = bitcast i32 %i to float
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
- ret void
-}
-
-declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/lshr.ll b/test/CodeGen/AMDGPU/lshr.ll
deleted file mode 100644
index c8ab7871434e..000000000000
--- a/test/CodeGen/AMDGPU/lshr.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s
-
-;CHECK: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1
-
-define void @test(i32 %p) {
- %i = udiv i32 %p, 2
- %r = bitcast i32 %i to float
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
- ret void
-}
-
-declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/lshr.v2i16.ll b/test/CodeGen/AMDGPU/lshr.v2i16.ll
new file mode 100644
index 000000000000..e21d0d09bb41
--- /dev/null
+++ b/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -0,0 +1,149 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
+
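+; A worked example of the operation under test (illustrative only):
+;   lshr <2 x i16> <i16 -256, i16 240>, <i16 8, i16 4>
+;     ==> <2 x i16> <i16 255, i16 15>
+; i.e. 0xff00 >> 8 = 0x00ff and 0x00f0 >> 4 = 0x000f. On GFX9 this is a
+; single packed v_pk_lshrrev_b16; older targets build it out of 32-bit
+; shift, mask, and or operations.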
+; GCN-LABEL: {{^}}s_lshr_v2i16:
+; GFX9: s_load_dword [[LHS:s[0-9]+]]
+; GFX9: s_load_dword [[RHS:s[0-9]+]]
+; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
+; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
+
+; VI: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CIVI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
+; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
+ %result = lshr <2 x i16> %lhs, %rhs
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_lshr_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
+; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+
+; VI: v_lshrrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_lshrrev_b16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}}
+; CI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[LHS]]
+; CI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[RHS]]
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; CI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
+; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1
+ %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
+ %result = lshr <2 x i16> %a, %b
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}lshr_v_s_v2i16:
+; GFX9: s_load_dword [[RHS:s[0-9]+]]
+; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+define amdgpu_kernel void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %result = lshr <2 x i16> %vgpr, %sgpr
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}lshr_s_v_v2i16:
+; GFX9: s_load_dword [[LHS:s[0-9]+]]
+; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
+; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+define amdgpu_kernel void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %result = lshr <2 x i16> %sgpr, %vgpr
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}lshr_imm_v_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
+; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], 8
+define amdgpu_kernel void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %result = lshr <2 x i16> <i16 8, i16 8>, %vgpr
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}lshr_v_imm_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], 8, [[LHS]]
+define amdgpu_kernel void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %result = lshr <2 x i16> %vgpr, <i16 8, i16 8>
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_lshr_v4i16:
+; GCN: {{buffer|flat}}_load_dwordx2
+; GCN: {{buffer|flat}}_load_dwordx2
+; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: {{buffer|flat}}_store_dwordx2
+define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
+ %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1
+ %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+ %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
+ %result = lshr <4 x i16> %a, %b
+ store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}lshr_v_imm_v4i16:
+; GCN: {{buffer|flat}}_load_dwordx2
+; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GCN: {{buffer|flat}}_store_dwordx2
+define amdgpu_kernel void @lshr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+ %result = lshr <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
+ store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/mad-combine.ll b/test/CodeGen/AMDGPU/mad-combine.ll
index 9caba32cbac0..b855fc500c6b 100644
--- a/test/CodeGen/AMDGPU/mad-combine.ll
+++ b/test/CodeGen/AMDGPU/mad-combine.ll
@@ -31,7 +31,7 @@ declare float @llvm.fmuladd.f32(float, float, float) #0
; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
-define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -70,7 +70,7 @@ define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrsp
; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
-define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -107,7 +107,7 @@ define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float a
; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
-define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -137,7 +137,7 @@ define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrsp
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
; SI: buffer_store_dword [[RESULT]]
-define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -174,7 +174,7 @@ define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float a
; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
-define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -209,7 +209,7 @@ define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, fl
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
; SI: buffer_store_dword [[RESULT]]
-define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -245,7 +245,7 @@ define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float a
; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
-define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -281,7 +281,7 @@ define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, fl
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
; SI: buffer_store_dword [[RESULT]]
-define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -319,7 +319,7 @@ define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float a
; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
-define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -362,7 +362,7 @@ define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %ou
; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
-define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -404,7 +404,7 @@ define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %ou
; SI-DENORM: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[C]], [[TMP1]]
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -447,7 +447,7 @@ define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %o
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
-define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -497,7 +497,7 @@ define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %o
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
-define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -548,7 +548,7 @@ define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %o
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
-define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
diff --git a/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/test/CodeGen/AMDGPU/mad24-get-global-id.ll
index 9183ae0972dc..1e78c4ebcc9f 100644
--- a/test/CodeGen/AMDGPU/mad24-get-global-id.ll
+++ b/test/CodeGen/AMDGPU/mad24-get-global-id.ll
@@ -11,7 +11,7 @@ declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff
; GCN: v_mov_b32_e32 [[VWGSIZEX:v[0-9]+]], [[WGSIZEX]]
; GCN: v_mad_u32_u24 v{{[0-9]+}}, [[VWGSIZEX]], s8, v0
-define void @get_global_id_0(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @get_global_id_0(i32 addrspace(1)* %out) #1 {
%dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
%cast.dispatch.ptr = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
%gep = getelementptr inbounds i32, i32 addrspace(2)* %cast.dispatch.ptr, i64 1
diff --git a/test/CodeGen/AMDGPU/mad_int24.ll b/test/CodeGen/AMDGPU/mad_int24.ll
index f149ea0a6a0e..af0159aa9b10 100644
--- a/test/CodeGen/AMDGPU/mad_int24.ll
+++ b/test/CodeGen/AMDGPU/mad_int24.ll
@@ -11,7 +11,7 @@
; CM: MULADD_INT24
; SI-NOT: and
; SI: v_mad_i32_i24
-define void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
entry:
%0 = shl i32 %a, 8
%a_24 = ashr i32 %0, 8
diff --git a/test/CodeGen/AMDGPU/mad_uint24.ll b/test/CodeGen/AMDGPU/mad_uint24.ll
index 9fde950f822c..2c4f7d324a96 100644
--- a/test/CodeGen/AMDGPU/mad_uint24.ll
+++ b/test/CodeGen/AMDGPU/mad_uint24.ll
@@ -11,7 +11,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; SI: v_mad_u32_u24
; VI: v_mad_u32_u24
-define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
entry:
%0 = shl i32 %a, 8
%a_24 = lshr i32 %0, 8
@@ -32,7 +32,7 @@ entry:
; FIXME: Should be using scalar instructions here.
; GCN: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; GCN: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16
-define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
+define amdgpu_kernel void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
entry:
%0 = mul i16 %a, %b
%1 = add i16 %0, %c
@@ -49,7 +49,7 @@ entry:
; EG: 8
; GCN: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
-define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
+define amdgpu_kernel void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
entry:
%0 = mul i8 %a, %b
%1 = add i8 %0, %c
@@ -68,7 +68,7 @@ entry:
; FUNC-LABEL: {{^}}i24_i32_i32_mad:
; EG: CNDE_INT
; SI: v_cndmask
-define void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_kernel void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
entry:
%0 = ashr i32 %a, 8
%1 = icmp ne i32 %c, 0
diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll
index 6722aa79dd5d..eb4066a2a0a8 100644
--- a/test/CodeGen/AMDGPU/madak.ll
+++ b/test/CodeGen/AMDGPU/madak.ll
@@ -10,7 +10,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone
; GCN: buffer_load_dword [[VA:v[0-9]+]]
; GCN: buffer_load_dword [[VB:v[0-9]+]]
; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
%in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
@@ -37,7 +37,7 @@ define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noa
; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VB]], [[VA]], [[VK]]
; GCN-DAG: v_mac_f32_e32 [[VK]], [[VC]], [[VA]]
; GCN: s_endpgm
-define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -64,7 +64,7 @@ define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1
; GCN-LABEL: {{^}}madak_m_inline_imm_f32:
; GCN: buffer_load_dword [[VA:v[0-9]+]]
; GCN: v_madak_f32_e32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
-define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind {
+define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -84,7 +84,7 @@ define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addr
; GCN: buffer_load_dword [[VA:v[0-9]+]]
; GCN: buffer_load_dword [[VB:v[0-9]+]]
; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
-define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
%in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
@@ -106,7 +106,7 @@ define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrsp
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]
; GCN-NOT: v_madak_f32
; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
-define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind {
+define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -125,7 +125,7 @@ define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)*
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]
; GCN-NOT: v_madak_f32
; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
-define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind {
+define amdgpu_kernel void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -141,7 +141,7 @@ define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float add
; GCN-LABEL: {{^}}s_s_madak_f32:
; GCN-NOT: v_madak_f32
; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
%mul = fmul float %a, %b
%madak = fadd float %mul, 10.0
store float %madak, float addrspace(1)* %out, align 4
@@ -153,7 +153,7 @@ define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwin
; GCN: buffer_load_dword [[VB:v[0-9]+]]
; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
; GCN: s_endpgm
-define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
%in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
@@ -175,7 +175,7 @@ define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float
; GCN: buffer_load_dword [[VB:v[0-9]+]]
; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
; GCN: s_endpgm
-define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
%in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
@@ -201,7 +201,7 @@ define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float
; GCN: v_madak_f32_e32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VGPR]], [[MADAK]]
; GCN: buffer_store_dword [[MUL]]
-define void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 {
+define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 {
bb:
%tmp = icmp eq i32 %arg1, 0
br i1 %tmp, label %bb3, label %bb4
diff --git a/test/CodeGen/AMDGPU/madmk.ll b/test/CodeGen/AMDGPU/madmk.ll
index 27fbf58d26c6..6e70e95383c9 100644
--- a/test/CodeGen/AMDGPU/madmk.ll
+++ b/test/CodeGen/AMDGPU/madmk.ll
@@ -12,7 +12,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN: v_mac_f32_e32 [[VB]], 0x41200000, [[VA]]
-define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -35,7 +35,7 @@ define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noa
; GCN-DAG: v_mac_f32_e32 [[VB]], [[VK]], [[VA]]
; GCN-DAG: v_mac_f32_e32 [[VC]], [[VK]], [[VA]]
; GCN: s_endpgm
-define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -64,7 +64,7 @@ define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN: v_mac_f32_e32 [[VB]], 4.0, [[VA]]
-define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -83,7 +83,7 @@ define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrsp
; GCN-NOT: v_madmk_f32
; GCN: v_mac_f32_e32
; GCN: s_endpgm
-define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -97,7 +97,7 @@ define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b)
; GCN-NOT: v_madmk_f32
; GCN: v_mad_f32
; GCN: s_endpgm
-define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind {
+define amdgpu_kernel void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -113,7 +113,7 @@ define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)*
; GCN-NOT: v_madmk_f32
; GCN: v_mac_f32_e32
; GCN: s_endpgm
-define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind {
+define amdgpu_kernel void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -130,7 +130,7 @@ define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float add
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], |[[VA]]|, [[VB]]
-define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -151,7 +151,7 @@ define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, |{{[sv][0-9]+}}|
-define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -172,7 +172,7 @@ define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float
; GCN: buffer_load_dword [[A:v[0-9]+]]
; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], [[A]], 2.0
-define void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -189,7 +189,7 @@ define void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float ad
; SI: s_xor_b64
; SI: v_mac_f32_e32 {{v[0-9]+}}, 0x472aee8c, {{v[0-9]+}}
; SI: s_or_b64
-define void @kill_madmk_verifier_error() nounwind {
+define amdgpu_kernel void @kill_madmk_verifier_error() nounwind {
bb:
br label %bb2
diff --git a/test/CodeGen/AMDGPU/max.i16.ll b/test/CodeGen/AMDGPU/max.i16.ll
index 3f2a87f20691..abd75258c4d4 100644
--- a/test/CodeGen/AMDGPU/max.i16.ll
+++ b/test/CodeGen/AMDGPU/max.i16.ll
@@ -1,12 +1,10 @@
-; RUN: llc < %s -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s
-
-
-declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VIPLUS %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=VIPLUS %s
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_imax_sge_i16:
-; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+; VIPLUS: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
@@ -20,12 +18,56 @@ define void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr
}
; FIXME: Need to handle non-uniform case for function below (load without gep).
-; GCN-LABEL: {{^}}v_test_imax_sge_v4i16:
+; GCN-LABEL: {{^}}v_test_imax_sge_v2i16:
; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+
+; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+ %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %bptr, i32 %tid
+ %outgep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep0, align 4
+ %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep1, align 4
+ %cmp = icmp sge <2 x i16> %a, %b
+ %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
+ store <2 x i16> %val, <2 x i16> addrspace(1)* %outgep, align 4
+ ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_imax_sge_v3i16:
+; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-NOT: v_max_i16
+
+; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %aptr, <3 x i16> addrspace(1)* %bptr) nounwind {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+ %gep0 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %bptr, i32 %tid
+ %outgep = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %out, i32 %tid
+ %a = load <3 x i16>, <3 x i16> addrspace(1)* %gep0, align 4
+ %b = load <3 x i16>, <3 x i16> addrspace(1)* %gep1, align 4
+ %cmp = icmp sge <3 x i16> %a, %b
+ %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
+ store <3 x i16> %val, <3 x i16> addrspace(1)* %outgep, align 4
+ ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_imax_sge_v4i16:
; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %aptr, <4 x i16> addrspace(1)* %bptr) nounwind {
+; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+
+; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %aptr, <4 x i16> addrspace(1)* %bptr) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %bptr, i32 %tid
@@ -40,8 +82,8 @@ define void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrs
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_imax_sgt_i16:
-; VI: v_max_i16_e32
-define void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+; VIPLUS: v_max_i16_e32
+define amdgpu_kernel void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
@@ -56,8 +98,8 @@ define void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_umax_uge_i16:
-; VI: v_max_u16_e32
-define void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+; VIPLUS: v_max_u16_e32
+define amdgpu_kernel void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
@@ -72,8 +114,8 @@ define void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_umax_ugt_i16:
-; VI: v_max_u16_e32
-define void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+; VIPLUS: v_max_u16_e32
+define amdgpu_kernel void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
@@ -85,3 +127,23 @@ define void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr
store i16 %val, i16 addrspace(1)* %outgep, align 4
ret void
}
+
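+; The v2i16 case is where targets diverge: GFX9 covers both halves with
+; one packed v_pk_max_u16, while VI needs a 16-bit max for the low half
+; plus an SDWA max for the high half (informal summary of the checks
+; below).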
+; GCN-LABEL: {{^}}v_test_umax_ugt_v2i16:
+; VI: v_max_u16_e32
+; VI: v_max_u16_sdwa
+
+; GFX9: v_pk_max_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+ %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %bptr, i32 %tid
+ %outgep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep0, align 4
+ %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep1, align 4
+ %cmp = icmp ugt <2 x i16> %a, %b
+ %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
+ store <2 x i16> %val, <2 x i16> addrspace(1)* %outgep, align 4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
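
The hunks above are representative of the whole patch: every test entry point is converted from a plain function to the amdgpu_kernel calling convention, and new GFX9 prefixes check the packed 16-bit instructions. A minimal sketch of the converted test shape, assuming the gfx901 RUN line used elsewhere in this patch (the kernel name and exact check lines here are illustrative, not copied from any file):

; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s

; GFX9-LABEL: {{^}}sketch_smax_v2i16:
; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @sketch_smax_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) {
  ; icmp sgt + select is the canonical smax pattern the backend matches
  %cmp = icmp sgt <2 x i16> %a, %b
  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %val, <2 x i16> addrspace(1)* %out
  ret void
}
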
diff --git a/test/CodeGen/AMDGPU/max.ll b/test/CodeGen/AMDGPU/max.ll
index 5fa307be0fd5..ffcdac03bc74 100644
--- a/test/CodeGen/AMDGPU/max.ll
+++ b/test/CodeGen/AMDGPU/max.ll
@@ -6,7 +6,7 @@
; SI: v_max_i32_e32
; EG: MAX_INT
-define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
%a = load i32, i32 addrspace(1)* %aptr, align 4
%b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp sge i32 %a, %b
@@ -26,7 +26,7 @@ define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
; EG: MAX_INT
; EG: MAX_INT
; EG: MAX_INT
-define void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %aptr, <4 x i32> addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %aptr, <4 x i32> addrspace(1)* %bptr) nounwind {
%a = load <4 x i32>, <4 x i32> addrspace(1)* %aptr, align 4
%b = load <4 x i32>, <4 x i32> addrspace(1)* %bptr, align 4
%cmp = icmp sge <4 x i32> %a, %b
@@ -39,7 +39,7 @@ define void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrs
; SI: s_max_i32
; EG: MAX_INT
-define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp sge i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -50,7 +50,7 @@ define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
-define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
%cmp = icmp sge i32 %a, 9
%val = select i1 %cmp, i32 %a, i32 9
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -63,7 +63,7 @@ define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; SI: v_max_i32_e32
; EG: MAX_INT
-define void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
%a = load i8, i8 addrspace(1)* %aptr, align 1
%b = load i8, i8 addrspace(1)* %bptr, align 1
%cmp = icmp sge i8 %a, %b
@@ -76,7 +76,7 @@ define void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i
; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
-define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
%cmp = icmp sgt i32 %a, 9
%val = select i1 %cmp, i32 %a, i32 9
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -89,7 +89,7 @@ define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
-define void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
+define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
%cmp = icmp sgt <2 x i32> %a, <i32 9, i32 9>
%val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> <i32 9, i32 9>
store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
@@ -100,7 +100,7 @@ define void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
; SI: v_max_i32_e32
; EG: MAX_INT
-define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
%a = load i32, i32 addrspace(1)* %aptr, align 4
%b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp sgt i32 %a, %b
@@ -113,7 +113,7 @@ define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
; SI: s_max_i32
; EG: MAX_INT
-define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp sgt i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -124,7 +124,7 @@ define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; SI: v_max_u32_e32
; EG: MAX_UINT
-define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
%a = load i32, i32 addrspace(1)* %aptr, align 4
%b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp uge i32 %a, %b
@@ -137,7 +137,7 @@ define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
; SI: s_max_u32
; EG: MAX_UINT
-define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp uge i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -155,7 +155,7 @@ define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; EG: MAX_UINT
; EG: MAX_UINT
; EG-NOT: MAX_UINT
-define void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <3 x i32> %b) nounwind {
+define amdgpu_kernel void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <3 x i32> %b) nounwind {
%cmp = icmp uge <3 x i32> %a, %b
%val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
store <3 x i32> %val, <3 x i32> addrspace(1)* %out, align 4
@@ -168,7 +168,7 @@ define void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <
; SI: v_max_u32_e32
; EG: MAX_UINT
-define void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
%a = load i8, i8 addrspace(1)* %aptr, align 1
%b = load i8, i8 addrspace(1)* %bptr, align 1
%cmp = icmp uge i8 %a, %b
@@ -181,7 +181,7 @@ define void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i
; SI: v_max_u32_e32
; EG: MAX_UINT
-define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
%a = load i32, i32 addrspace(1)* %aptr, align 4
%b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp ugt i32 %a, %b
@@ -194,7 +194,7 @@ define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
; SI: s_max_u32
; EG: MAX_UINT
-define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp ugt i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -207,7 +207,7 @@ define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; EG: MAX_UINT {{.*}}literal.{{[xyzw]}}
; EG: MAX_UINT {{.*}}literal.{{[xyzw]}}
-define void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
+define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
%cmp = icmp ugt <2 x i32> %a, <i32 15, i32 23>
%val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> <i32 15, i32 23>
store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
@@ -223,7 +223,7 @@ define void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
; SI: buffer_store_dword [[VMAX]]
; EG: MAX_UINT
-define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
+define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
%a.ext = zext i16 %a to i32
%b.ext = zext i16 %b to i32
%cmp = icmp ugt i32 %a.ext, %b.ext
@@ -243,7 +243,7 @@ define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i1
; SI: buffer_store_dword [[VMAX]]
; EG: MAX_INT
-define void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
+define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
%a.ext = sext i16 %a to i32
%b.ext = sext i16 %b to i32
%cmp = icmp sgt i32 %a.ext, %b.ext
@@ -262,7 +262,7 @@ define void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16
; SI: s_max_i32
; EG: MAX_INT
-define void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
+define amdgpu_kernel void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
%cmp = icmp sge i16 %a, %b
%val = select i1 %cmp, i16 %a, i16 %b
store i16 %val, i16 addrspace(1)* %out
@@ -275,7 +275,7 @@ define void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwin
; EG: MAX_UINT
; EG: MAX_UINT
-define void @test_umax_ugt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_umax_ugt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%tmp = icmp ugt i64 %a, %b
%val = select i1 %tmp, i64 %a, i64 %b
store i64 %val, i64 addrspace(1)* %out, align 8
@@ -287,7 +287,7 @@ define void @test_umax_ugt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind
; EG: MAX_UINT
; EG: MAX_UINT
-define void @test_umax_uge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_umax_uge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%tmp = icmp uge i64 %a, %b
%val = select i1 %tmp, i64 %a, i64 %b
store i64 %val, i64 addrspace(1)* %out, align 8
@@ -299,7 +299,7 @@ define void @test_umax_uge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind
; EG-DAG: MAX_UINT
; EG-DAG: MAX_INT
-define void @test_imax_sgt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_imax_sgt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%tmp = icmp sgt i64 %a, %b
%val = select i1 %tmp, i64 %a, i64 %b
store i64 %val, i64 addrspace(1)* %out, align 8
@@ -311,7 +311,7 @@ define void @test_imax_sgt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind
; EG-DAG: MAX_UINT
; EG-DAG: MAX_INT
-define void @test_imax_sge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_imax_sge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%tmp = icmp sge i64 %a, %b
%val = select i1 %tmp, i64 %a, i64 %b
store i64 %val, i64 addrspace(1)* %out, align 8
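
The v_/s_ naming in max.ll tracks the two selection paths: values loaded per work-item are divergent and must stay in VGPRs, picking the VALU v_max_* forms, while kernel arguments are uniform and can use the SALU s_max_* forms. A side-by-side sketch of the two shapes (function names illustrative):

declare i32 @llvm.amdgcn.workitem.id.x()

; GCN: v_max_i32_e32
define amdgpu_kernel void @sketch_divergent_max(i32 addrspace(1)* %out, i32 addrspace(1)* %p) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %p, i32 %tid
  %a = load i32, i32 addrspace(1)* %gep      ; per-lane address: divergent value
  %cmp = icmp sgt i32 %a, 0
  %val = select i1 %cmp, i32 %a, i32 0
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; GCN: s_max_i32
define amdgpu_kernel void @sketch_uniform_max(i32 addrspace(1)* %out, i32 %a) {
  %cmp = icmp sgt i32 %a, 0                  ; kernel argument: uniform, lives in SGPRs
  %val = select i1 %cmp, i32 %a, i32 0
  store i32 %val, i32 addrspace(1)* %out
  ret void
}
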
diff --git a/test/CodeGen/AMDGPU/max3.ll b/test/CodeGen/AMDGPU/max3.ll
index a12dba2eb6e9..4bb4fd46becd 100644
--- a/test/CodeGen/AMDGPU/max3.ll
+++ b/test/CodeGen/AMDGPU/max3.ll
@@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; FUNC-LABEL: @v_test_imax3_sgt_i32
; SI: v_max3_i32
-define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -23,7 +23,7 @@ define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %apt
; FUNC-LABEL: @v_test_umax3_ugt_i32
; SI: v_max3_u32
-define void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
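
max3.ll checks the fold of two chained max patterns over three divergent operands into the single three-operand v_max3_i32. A sketch of the pattern being matched, under the same divergent-load setup as above (names illustrative):

declare i32 @llvm.amdgcn.workitem.id.x()

; SI: v_max3_i32
define amdgpu_kernel void @sketch_imax3(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr, i32 addrspace(1)* %c.ptr) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %a.gep = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 %tid
  %b.gep = getelementptr i32, i32 addrspace(1)* %b.ptr, i32 %tid
  %c.gep = getelementptr i32, i32 addrspace(1)* %c.ptr, i32 %tid
  %a = load i32, i32 addrspace(1)* %a.gep
  %b = load i32, i32 addrspace(1)* %b.gep
  %c = load i32, i32 addrspace(1)* %c.gep
  %cmp1 = icmp sgt i32 %a, %b
  %max1 = select i1 %cmp1, i32 %a, i32 %b    ; max(a, b)
  %cmp2 = icmp sgt i32 %max1, %c
  %max2 = select i1 %cmp2, i32 %max1, i32 %c ; max(max(a, b), c) -> v_max3_i32
  store i32 %max2, i32 addrspace(1)* %out
  ret void
}
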
diff --git a/test/CodeGen/AMDGPU/mem-builtins.ll b/test/CodeGen/AMDGPU/mem-builtins.ll
index 97512670f59e..1cbd0c327510 100644
--- a/test/CodeGen/AMDGPU/mem-builtins.ll
+++ b/test/CodeGen/AMDGPU/mem-builtins.ll
@@ -9,7 +9,7 @@ declare i32 @strcmp(i8* nocapture, i8* nocapture) #1
; ERROR: error: <unknown>:0:0: in function test_memcmp void (i8 addrspace(1)*, i8 addrspace(1)*, i32*): unsupported call to function memcmp
-define void @test_memcmp(i8 addrspace(1)* %x, i8 addrspace(1)* %y, i32* nocapture %p) #0 {
+define amdgpu_kernel void @test_memcmp(i8 addrspace(1)* %x, i8 addrspace(1)* %y, i32* nocapture %p) #0 {
entry:
%cmp = tail call i32 @memcmp(i8 addrspace(1)* %x, i8 addrspace(1)* %y, i64 2)
store volatile i32 %cmp, i32 addrspace(1)* undef
@@ -17,35 +17,35 @@ entry:
}
; ERROR: error: <unknown>:0:0: in function test_memchr void (i8 addrspace(1)*, i32, i64): unsupported call to function memchr
-define void @test_memchr(i8 addrspace(1)* %src, i32 %char, i64 %len) #0 {
+define amdgpu_kernel void @test_memchr(i8 addrspace(1)* %src, i32 %char, i64 %len) #0 {
%res = call i8 addrspace(1)* @memchr(i8 addrspace(1)* %src, i32 %char, i64 %len)
store volatile i8 addrspace(1)* %res, i8 addrspace(1)* addrspace(1)* undef
ret void
}
; ERROR: error: <unknown>:0:0: in function test_strcpy void (i8*, i8*): unsupported call to function strcpy
-define void @test_strcpy(i8* %dst, i8* %src) #0 {
+define amdgpu_kernel void @test_strcpy(i8* %dst, i8* %src) #0 {
%res = call i8* @strcpy(i8* %dst, i8* %src)
store volatile i8* %res, i8* addrspace(1)* undef
ret void
}
; ERROR: error: <unknown>:0:0: in function test_strcmp void (i8*, i8*): unsupported call to function strcmp
-define void @test_strcmp(i8* %src0, i8* %src1) #0 {
+define amdgpu_kernel void @test_strcmp(i8* %src0, i8* %src1) #0 {
%res = call i32 @strcmp(i8* %src0, i8* %src1)
store volatile i32 %res, i32 addrspace(1)* undef
ret void
}
; ERROR: error: <unknown>:0:0: in function test_strlen void (i8*): unsupported call to function strlen
-define void @test_strlen(i8* %src) #0 {
+define amdgpu_kernel void @test_strlen(i8* %src) #0 {
%res = call i32 @strlen(i8* %src)
store volatile i32 %res, i32 addrspace(1)* undef
ret void
}
; ERROR: error: <unknown>:0:0: in function test_strnlen void (i8*, i32): unsupported call to function strnlen
-define void @test_strnlen(i8* %src, i32 %size) #0 {
+define amdgpu_kernel void @test_strnlen(i8* %src, i32 %size) #0 {
%res = call i32 @strnlen(i8* %src, i32 %size)
store volatile i32 %res, i32 addrspace(1)* undef
ret void
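
The mem-builtins.ll checks expect a diagnostic rather than generated code, so this style of test inverts llc's exit status and FileChecks stderr. The exact RUN line sits above the hunks shown here; its usual shape is:

; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck -check-prefix=ERROR %s

Note that the ERROR lines still print the signature as void (...) after the conversion: the calling convention is an attribute of the function, not part of its type, so the expected diagnostics did not need updating.
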
diff --git a/test/CodeGen/AMDGPU/merge-stores.ll b/test/CodeGen/AMDGPU/merge-stores.ll
index 07104ebc8c97..dfd5b97fcc86 100644
--- a/test/CodeGen/AMDGPU/merge-stores.ll
+++ b/test/CodeGen/AMDGPU/merge-stores.ll
@@ -1,8 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
-
-; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
; This test is mostly to test DAG store merging, so disable the vectorizer.
; Run on devices with different unaligned load restrictions.
@@ -16,7 +13,7 @@
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
-define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
%out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
store i8 123, i8 addrspace(1)* %out.gep.1
@@ -28,7 +25,7 @@ define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
-define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
%out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
store i8 123, i8 addrspace(1)* %out.gep.1
@@ -38,7 +35,7 @@ define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %o
; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
; GCN: buffer_store_dword v
-define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
%out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
store i16 123, i16 addrspace(1)* %out.gep.1
@@ -48,7 +45,7 @@ define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
; GCN: buffer_store_dword v
-define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
%out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
store i16 0, i16 addrspace(1)* %out.gep.1
@@ -60,7 +57,7 @@ define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: s_endpgm
-define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
%out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
store i16 123, i16 addrspace(1)* %out.gep.1
@@ -72,7 +69,7 @@ define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)*
; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
store i32 123, i32 addrspace(1)* %out.gep.1
@@ -82,7 +79,7 @@ define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
; GCN: buffer_store_dwordx2
-define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
store float 1.0, float addrspace(1)* %out.gep.1.bc
@@ -94,7 +91,7 @@ define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
%out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
store i32 123, i32 addrspace(1)* %out.gep.1.bc
@@ -108,7 +105,7 @@ define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
%out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -122,7 +119,7 @@ define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
; GCN: buffer_store_dwordx4
-define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
%out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -137,7 +134,7 @@ define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out)
; First store is out of order.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
; GCN: buffer_store_dwordx4
-define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
%out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -150,14 +147,9 @@ define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
}
; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
-; GCN-NOAA: buffer_store_dwordx4 v
-
-; GCN-AA: buffer_store_dwordx2
-; GCN-AA: buffer_store_dword v
-; GCN-AA: buffer_store_dword v
-
+; GCN-AA: buffer_store_dwordx4 v
; GCN: s_endpgm
-define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
%out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -177,7 +169,7 @@ define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %o
; SI-DAG: buffer_store_dword
; SI-NOT: buffer_store_dword
; GCN: s_endpgm
-define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
@@ -189,7 +181,7 @@ define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
; GCN: buffer_store_dwordx4
-define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
%out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
store i64 123, i64 addrspace(1)* %out.gep.1
@@ -200,7 +192,7 @@ define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
-define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
%out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
%out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
%out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
@@ -215,7 +207,7 @@ define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[LOAD]]
-define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
@@ -230,7 +222,7 @@ define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
-define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
@@ -249,7 +241,7 @@ define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(
; GCN: buffer_load_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
-define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
@@ -264,7 +256,7 @@ define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
-define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
%out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -291,7 +283,7 @@ define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32
; SI-DAG: buffer_store_dword v
; SI-DAG: buffer_store_dwordx2 v
; GCN: s_endpgm
-define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
@@ -310,7 +302,7 @@ define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
-define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
%out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -333,7 +325,7 @@ define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, f
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
-define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
%in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
@@ -359,7 +351,7 @@ define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: s_barrier
; GCN: buffer_store_dwordx4 [[LOAD]]
-define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
%out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -396,7 +388,7 @@ define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
-define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
%out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -424,7 +416,7 @@ define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: buffer_store_dword [[LOAD]]
; GCN: s_endpgm
-define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
%out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
%out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
%out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
@@ -454,7 +446,7 @@ define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 ad
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
-define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
%out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
%out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
%out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
@@ -474,19 +466,11 @@ define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1
ret void
}
-; This works once AA is enabled on the subtarget
; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
-
-; GCN-NOAA: buffer_store_dword v
-; GCN-NOAA: buffer_store_dword v
-; GCN-NOAA: buffer_store_dword v
-; GCN-NOAA: buffer_store_dword v
-
-; GCN-AA: buffer_store_dwordx4 [[LOAD]]
-
+; GCN: buffer_store_dwordx4 [[LOAD]]
; GCN: s_endpgm
-define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
%out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -508,7 +492,7 @@ define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: s_endpgm
-define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
%out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
store i8 123, i8 addrspace(3)* %out.gep.1
@@ -520,7 +504,7 @@ define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
-define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
%out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
store i32 123, i32 addrspace(3)* %out.gep.1
@@ -538,7 +522,7 @@ define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1
; GCN: s_endpgm
-define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
%out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
%out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
%out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
@@ -556,7 +540,7 @@ define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
; GCN: buffer_store_dword v[[HI]]
-define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
store i32 9, i32 addrspace(1)* %out, align 4
%idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
store i32 12, i32 addrspace(1)* %idx1, align 4
@@ -572,7 +556,7 @@ define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
-define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
store i32 13, i32 addrspace(1)* %out, align 4
%idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
store i32 15, i32 addrspace(1)* %idx1, align 4
@@ -591,7 +575,7 @@ define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
; GCN: buffer_store_dword v
-define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
store i32 34, i32 addrspace(1)* %out, align 4
%idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
store i32 999, i32 addrspace(1)* %idx1, align 4
@@ -612,7 +596,7 @@ define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
-define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
store i32 34, i32 addrspace(1)* %out, align 4
%idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
store i32 999, i32 addrspace(1)* %idx1, align 4
@@ -646,7 +630,7 @@ define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: ScratchSize: 0{{$}}
-define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
%vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
ret void
@@ -662,7 +646,7 @@ define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> a
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
-define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
%vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
ret void
@@ -678,7 +662,7 @@ define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> a
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: ScratchSize: 0{{$}}
-define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
%vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
%fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
store <3 x float> %fadd, <3 x float> addrspace(1)* %out
@@ -695,7 +679,7 @@ define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x floa
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
-define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
%vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
%fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
store <3 x double> %fadd, <3 x double> addrspace(1)* %out
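
A reading aid for the offset:N operands in the nonzero-base checks above: buffer offsets are in bytes, i.e. the getelementptr element index scaled by the element size. From the hunks in this file:

  ; dwordx2 case: %in.gep.0 at i32 index 2  ->  2 * 4 bytes = offset:8
  ; dwordx4 case: %in.gep.0 at i32 index 11 -> 11 * 4 bytes = offset:44

By contrast, the offset0/offset1 operands of ds_write2_b32 count elements of the access size, so offset1:1 in the local-store checks means the two dwords land 4 bytes apart.
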
diff --git a/test/CodeGen/AMDGPU/min.ll b/test/CodeGen/AMDGPU/min.ll
index 19d0117d64a9..e85a724c1567 100644
--- a/test/CodeGen/AMDGPU/min.ll
+++ b/test/CodeGen/AMDGPU/min.ll
@@ -1,17 +1,22 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}v_test_imin_sle_i32:
; GCN: v_min_i32_e32
; EG: MIN_INT
-define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %a = load i32, i32 addrspace(1)* %aptr, align 4
- %b = load i32, i32 addrspace(1)* %bptr, align 4
+define amdgpu_kernel void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
+ %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %a.gep, align 4
+ %b = load i32, i32 addrspace(1)* %b.gep, align 4
%cmp = icmp sle i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %out, align 4
+ store i32 %val, i32 addrspace(1)* %out.gep, align 4
ret void
}
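
The min.ll updates do more than switch the calling convention: the v_ tests now index their loads by the work-item id, making the loaded values explicitly divergent so the checks keep pinning the VALU forms (the same issue the "non-uniform case" FIXMEs in the max tests point at). The two load shapes, as fragments:

  ; uniform address: every lane reads the same location
  %a.uni = load i32, i32 addrspace(1)* %aptr

  ; divergent address: per-lane element, value lands in a VGPR
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %aptr, i32 %tid
  %a.div = load i32, i32 addrspace(1)* %a.gep
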
@@ -19,7 +24,7 @@ define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
; GCN: s_min_i32
; EG: MIN_INT
-define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%cmp = icmp sle i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -30,7 +35,7 @@ define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; GCN: s_min_i32
; EG: MIN_INT
-define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
%cmp = icmp sle <1 x i32> %a, %b
%val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
store <1 x i32> %val, <1 x i32> addrspace(1)* %out
@@ -47,7 +52,7 @@ define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <
; EG: MIN_INT
; EG: MIN_INT
; EG: MIN_INT
-define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
%cmp = icmp sle <4 x i32> %a, %b
%val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
store <4 x i32> %val, <4 x i32> addrspace(1)* %out
@@ -60,7 +65,7 @@ define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <
; GCN: s_sext_i32_i8
; GCN: s_sext_i32_i8
; GCN: s_min_i32
-define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) #0 {
%cmp = icmp sle i8 %a, %b
%val = select i1 %cmp, i8 %a, i8 %b
store i8 %val, i8 addrspace(1)* %out
@@ -90,30 +95,62 @@ define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) nounwind {
; VI: v_min_i32
; VI: v_min_i32
+; GFX9: v_min_i16
+; GFX9: v_min_i16
+; GFX9: v_min_i16
+; GFX9: v_min_i16
+
; GCN: s_endpgm
; EG: MIN_INT
; EG: MIN_INT
; EG: MIN_INT
; EG: MIN_INT
-define void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) #0 {
%cmp = icmp sle <4 x i8> %a, %b
%val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
store <4 x i8> %val, <4 x i8> addrspace(1)* %out
ret void
}
+; FUNC-LABEL: {{^}}s_test_imin_sle_v2i16:
+; SI: v_min_i32
+; SI: v_min_i32
+
+; VI: v_min_i32
+; VI: v_min_i32
+
+; GFX9: v_pk_min_i16
+
+; EG: MIN_INT
+; EG: MIN_INT
+define amdgpu_kernel void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
+ %cmp = icmp sle <2 x i16> %a, %b
+ %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
+ store <2 x i16> %val, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: VI should use s_min_i32
; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16:
; SI: v_min_i32
; SI: v_min_i32
; SI: v_min_i32
; SI: v_min_i32
+; VI: v_min_i32
+; VI: v_min_i32
+; VI: v_min_i32
+; VI: v_min_i32
+
+; GFX9: v_pk_min_i16
+; GFX9: v_pk_min_i16
+
; EG: MIN_INT
; EG: MIN_INT
; EG: MIN_INT
; EG: MIN_INT
-define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) #0 {
%cmp = icmp sle <4 x i16> %a, %b
%val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
store <4 x i16> %val, <4 x i16> addrspace(1)* %out
@@ -124,12 +161,36 @@ define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <
; GCN: v_min_i32_e32
; EG: MIN_INT
-define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %a = load i32, i32 addrspace(1)* %aptr, align 4
- %b = load i32, i32 addrspace(1)* %bptr, align 4
+define amdgpu_kernel void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %aptr, i32 %tid
+ %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %bptr, i32 %tid
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %a.gep, align 4
+ %b = load i32, i32 addrspace(1)* %b.gep, align 4
%cmp = icmp slt i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %out, align 4
+ store i32 %val, i32 addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_test_imin_slt_i16
+; SI: v_min_i32_e32
+
+; GFX89: v_min_i16_e32
+
+; EG: MIN_INT
+define amdgpu_kernel void @v_test_imin_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %aptr, i32 %tid
+ %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %bptr, i32 %tid
+ %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
+
+ %a = load i16, i16 addrspace(1)* %a.gep
+ %b = load i16, i16 addrspace(1)* %b.gep
+ %cmp = icmp slt i16 %a, %b
+ %val = select i1 %cmp, i16 %a, i16 %b
+ store i16 %val, i16 addrspace(1)* %out.gep
ret void
}
@@ -137,7 +198,7 @@ define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
; GCN: s_min_i32
; EG: MIN_INT
-define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%cmp = icmp slt i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -150,7 +211,7 @@ define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; EG: MIN_INT
; EG: MIN_INT
-define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+define amdgpu_kernel void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
%cmp = icmp slt <2 x i32> %a, %b
%val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
store <2 x i32> %val, <2 x i32> addrspace(1)* %out
@@ -161,7 +222,7 @@ define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <
; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
; EG: MIN_INT {{.*}}literal.{{[xyzw]}}
-define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 {
%cmp = icmp slt i32 %a, 8
%val = select i1 %cmp, i32 %a, i32 8
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -172,7 +233,7 @@ define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
; EG: MIN_INT {{.*}}literal.{{[xyzw]}}
-define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 {
%cmp = icmp sle i32 %a, 8
%val = select i1 %cmp, i32 %a, i32 8
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -183,12 +244,16 @@ define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; GCN: v_min_u32_e32
; EG: MIN_UINT
-define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %a = load i32, i32 addrspace(1)* %aptr, align 4
- %b = load i32, i32 addrspace(1)* %bptr, align 4
+define amdgpu_kernel void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
+ %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %a.gep, align 4
+ %b = load i32, i32 addrspace(1)* %b.gep, align 4
%cmp = icmp ule i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %out, align 4
+ store i32 %val, i32 addrspace(1)* %out.gep, align 4
ret void
}
@@ -196,25 +261,65 @@ define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
; GCN: v_min_u32_e32
; GCN: v_min_u32_e32
; GCN: v_min_u32_e32
-; SI-NOT: v_min_u32_e32
+; GCN-NOT: v_min_u32_e32
; GCN: s_endpgm
; EG: MIN_UINT
; EG: MIN_UINT
; EG: MIN_UINT
-define void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %aptr, <3 x i32> addrspace(1)* %bptr) nounwind {
- %a = load <3 x i32>, <3 x i32> addrspace(1)* %aptr
- %b = load <3 x i32>, <3 x i32> addrspace(1)* %bptr
+define amdgpu_kernel void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %a.ptr, <3 x i32> addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a.ptr, i32 %tid
+ %b.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b.ptr, i32 %tid
+ %out.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid
+
+ %a = load <3 x i32>, <3 x i32> addrspace(1)* %a.gep
+ %b = load <3 x i32>, <3 x i32> addrspace(1)* %b.gep
%cmp = icmp ule <3 x i32> %a, %b
%val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
- store <3 x i32> %val, <3 x i32> addrspace(1)* %out
+ store <3 x i32> %val, <3 x i32> addrspace(1)* %out.gep
+ ret void
+}
+
+; FIXME: Reduce unused packed component to scalar
+; FUNC-LABEL: @v_test_umin_ule_v3i16{{$}}
+; SI: v_min_u32_e32
+; SI: v_min_u32_e32
+; SI: v_min_u32_e32
+; SI-NOT: v_min_u32_e32
+
+; VI: v_min_u16_e32
+; VI: v_min_u16_sdwa
+; VI: v_min_u16_e32
+; VI-NOT: v_min_u16_e32
+
+; GFX9: v_pk_min_u16
+; GFX9: v_pk_min_u16
+
+; GCN: s_endpgm
+
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+define amdgpu_kernel void @v_test_umin_ule_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %a.ptr, i32 %tid
+ %b.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %b.ptr, i32 %tid
+ %out.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %out, i32 %tid
+
+ %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.gep
+ %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.gep
+ %cmp = icmp ule <3 x i16> %a, %b
+ %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
+ store <3 x i16> %val, <3 x i16> addrspace(1)* %out.gep
ret void
}
+
; FUNC-LABEL: @s_test_umin_ule_i32
; GCN: s_min_u32
; EG: MIN_UINT
-define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%cmp = icmp ule i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, i32 addrspace(1)* %out, align 4
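
A note on the _sdwa checks in the 16-bit tests above: SDWA is how VI operates on the high half of a 32-bit VGPR. The src0_sel/src1_sel:WORD_1 operands read the high words of the sources, dst_sel:WORD_1 writes the result into the destination's high word, and dst_unused:UNUSED_PAD zero-fills the rest, so a <2 x i16> min is one plain VOP2 on the low words plus one SDWA op on the high words (register numbers illustrative):

  ; v_min_u16_e32  v2, v0, v1                      ; low words of v0/v1
  ; v_min_u16_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1

On GFX9 the same work is a single packed v_pk_min_u16.
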
@@ -225,27 +330,40 @@ define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; GCN: v_min_u32_e32
; EG: MIN_UINT
-define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %a = load i32, i32 addrspace(1)* %aptr, align 4
- %b = load i32, i32 addrspace(1)* %bptr, align 4
+define amdgpu_kernel void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
+ %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %a.gep, align 4
+ %b = load i32, i32 addrspace(1)* %b.gep, align 4
%cmp = icmp ult i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %out, align 4
+ store i32 %val, i32 addrspace(1)* %out.gep, align 4
ret void
}
; FUNC-LABEL: {{^}}v_test_umin_ult_i8:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: v_min_u32_e32
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: v_min_u32_e32
+
+; GFX89: flat_load_ubyte
+; GFX89: flat_load_ubyte
+; GFX89: v_min_u16_e32
; EG: MIN_UINT
-define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
- %a = load i8, i8 addrspace(1)* %aptr, align 1
- %b = load i8, i8 addrspace(1)* %bptr, align 1
+define amdgpu_kernel void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %a.ptr, i8 addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr inbounds i8, i8 addrspace(1)* %a.ptr, i32 %tid
+ %b.gep = getelementptr inbounds i8, i8 addrspace(1)* %b.ptr, i32 %tid
+ %out.gep = getelementptr inbounds i8, i8 addrspace(1)* %out, i32 %tid
+
+ %a = load i8, i8 addrspace(1)* %a.gep, align 1
+ %b = load i8, i8 addrspace(1)* %b.gep, align 1
%cmp = icmp ult i8 %a, %b
%val = select i1 %cmp, i8 %a, i8 %b
- store i8 %val, i8 addrspace(1)* %out, align 1
+ store i8 %val, i8 addrspace(1)* %out.gep, align 1
ret void
}
@@ -253,7 +371,7 @@ define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i
; GCN: s_min_u32
; EG: MIN_UINT
-define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%cmp = icmp ult i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -268,7 +386,7 @@ define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; GCN: s_endpgm
; EG-NOT: MIN_UINT
-define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 {
%a = load i32, i32 addrspace(1)* %aptr, align 4
%b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp ult i32 %a, %b
@@ -286,7 +404,7 @@ define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace
; GCN: s_endpgm
; EG-NOT: MIN_UINT
-define void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 {
%a = load i16, i16 addrspace(1)* %aptr, align 2
%b = load i16, i16 addrspace(1)* %bptr, align 2
%cmp = icmp ult i16 %a, %b
@@ -301,7 +419,7 @@ define void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace
; GCN: s_min_u32
; EG: MIN_UINT
-define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
+define amdgpu_kernel void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
%cmp = icmp ult <1 x i32> %a, %b
%val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
store <1 x i32> %val, <1 x i32> addrspace(1)* %out
@@ -326,7 +444,7 @@ define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <
; EG: MIN_UINT
; EG: MIN_UINT
; EG: MIN_UINT
-define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind {
+define amdgpu_kernel void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) #0 {
%cmp = icmp ult <8 x i32> %a, %b
%val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b
store <8 x i32> %val, <8 x i32> addrspace(1)* %out
@@ -334,14 +452,23 @@ define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <
}
; FUNC-LABEL: {{^}}s_test_umin_ult_v8i16:
-; GCN: v_min_u32
-; GCN: v_min_u32
-; GCN: v_min_u32
-; GCN: v_min_u32
-; GCN: v_min_u32
-; GCN: v_min_u32
-; GCN: v_min_u32
-; GCN: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+
+; VI: v_min_u32
+; VI: v_min_u32
+; VI: v_min_u32
+; VI: v_min_u32
+; VI: v_min_u32
+; VI: v_min_u32
+; VI: v_min_u32
+; VI: v_min_u32
; EG: MIN_UINT
; EG: MIN_UINT
@@ -351,7 +478,7 @@ define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <
; EG: MIN_UINT
; EG: MIN_UINT
; EG: MIN_UINT
-define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind {
+define amdgpu_kernel void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) #0 {
%cmp = icmp ult <8 x i16> %a, %b
%val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
store <8 x i16> %val, <8 x i16> addrspace(1)* %out
@@ -367,7 +494,7 @@ define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <
; GCN: buffer_store_dword [[VMIN]]
; EG: MIN_UINT
-define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
+define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
%a.ext = zext i16 %a to i32
%b.ext = zext i16 %b to i32
%cmp = icmp ult i32 %a.ext, %b.ext
@@ -387,7 +514,7 @@ define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i1
; GCN: buffer_store_dword [[VMIN]]
; EG: MIN_INT
-define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
+define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) #0 {
%a.ext = sext i16 %a to i32
%b.ext = sext i16 %b to i32
%cmp = icmp slt i32 %a.ext, %b.ext
@@ -402,7 +529,7 @@ define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16
; GCN: s_min_i32
; EG: MIN_INT
-define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) #0 {
%cmp = icmp sle i16 %a, %b
%val = select i1 %cmp, i16 %a, i16 %b
store i16 %val, i16 addrspace(1)* %out
@@ -415,7 +542,7 @@ define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwin
; EG: MIN_UINT
; EG: MIN_UINT
-define void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
%tmp = icmp ult i64 %a, %b
%val = select i1 %tmp, i64 %a, i64 %b
store i64 %val, i64 addrspace(1)* %out, align 8
@@ -427,7 +554,7 @@ define void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind
; EG: MIN_UINT
; EG: MIN_UINT
-define void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
%tmp = icmp ule i64 %a, %b
%val = select i1 %tmp, i64 %a, i64 %b
store i64 %val, i64 addrspace(1)* %out, align 8
@@ -439,7 +566,7 @@ define void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind
; EG-DAG: MIN_UINT
; EG-DAG: MIN_INT
-define void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
%tmp = icmp slt i64 %a, %b
%val = select i1 %tmp, i64 %a, i64 %b
store i64 %val, i64 addrspace(1)* %out, align 8
@@ -451,9 +578,63 @@ define void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind
; EG-DAG: MIN_UINT
; EG-DAG: MIN_INT
-define void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
%tmp = icmp sle i64 %a, %b
%val = select i1 %tmp, i64 %a, i64 %b
store i64 %val, i64 addrspace(1)* %out, align 8
ret void
}
+
+; FUNC-LABEL: {{^}}v_test_imin_sle_v2i16:
+; SI: v_min_i32
+; SI: v_min_i32
+
+; VI: v_min_i16
+; VI: v_min_i16
+
+; GFX9: v_pk_min_i16
+
+; EG: MIN_INT
+; EG: MIN_INT
+define amdgpu_kernel void @v_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid
+ %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.gep
+ %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.gep
+ %cmp = icmp sle <2 x i16> %a, %b
+ %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
+ store <2 x i16> %val, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; FIXME: i16 min
+; FUNC-LABEL: {{^}}v_test_imin_ule_v2i16:
+; SI: v_min_u32
+; SI: v_min_u32
+
+; VI: v_min_u16
+; VI: v_min_u16
+
+; GFX9: v_pk_min_u16
+
+; EG: MIN_UINT
+; EG: MIN_UINT
+define amdgpu_kernel void @v_test_imin_ule_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid
+ %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.gep
+ %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.gep
+ %cmp = icmp ule <2 x i16> %a, %b
+ %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
+ store <2 x i16> %val, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/min3.ll b/test/CodeGen/AMDGPU/min3.ll
index 728479ad9f62..59d5d2cdb1aa 100644
--- a/test/CodeGen/AMDGPU/min3.ll
+++ b/test/CodeGen/AMDGPU/min3.ll
@@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; FUNC-LABEL: @v_test_imin3_slt_i32
; SI: v_min3_i32
-define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -23,7 +23,7 @@ define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %apt
; FUNC-LABEL: @v_test_umin3_ult_i32
; SI: v_min3_u32
-define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -43,7 +43,7 @@ define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %apt
; FUNC-LABEL: @v_test_umin_umin_umin
; SI: v_min_i32
; SI: v_min3_i32
-define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%tid2 = mul i32 %tid, 2
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
@@ -77,7 +77,7 @@ define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %ap
; FUNC-LABEL: @v_test_umin3_2_uses
; SI-NOT: v_min3
-define void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%tid2 = mul i32 %tid, 2
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
diff --git a/test/CodeGen/AMDGPU/missing-store.ll b/test/CodeGen/AMDGPU/missing-store.ll
index 8e1b0036a1af..83c2a911a5ce 100644
--- a/test/CodeGen/AMDGPU/missing-store.ll
+++ b/test/CodeGen/AMDGPU/missing-store.ll
@@ -15,7 +15,7 @@
; SI: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
; SI: buffer_store_dword
; SI: s_endpgm
-define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+define amdgpu_kernel void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
%ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @ptr_load, align 8
%ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
diff --git a/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
index 85dfbe6b8a33..e1fb00a1de30 100644
--- a/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
+++ b/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
@@ -19,7 +19,7 @@
; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]]
; GCN: buffer_load_ubyte v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}},
-define void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 {
+define amdgpu_kernel void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 {
bb:
%tmp = icmp sgt i32 %arg3, 0
br i1 %tmp, label %bb4, label %bb17
diff --git a/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll b/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
index 1a0a39027853..417b4ba802e1 100644
--- a/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
+++ b/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
@@ -11,7 +11,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
; GCN-LABEL: {{^}}atomic_max_i32:
; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400 glc{{$}}
-define void @atomic_max_i32(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 {
+define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid
%ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep
@@ -31,7 +31,7 @@ exit:
; GCN-LABEL: {{^}}atomic_max_i32_noret:
; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400{{$}}
-define void @atomic_max_i32_noret(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 {
+define amdgpu_kernel void @atomic_max_i32_noret(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid
%ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep
diff --git a/test/CodeGen/AMDGPU/mubuf.ll b/test/CodeGen/AMDGPU/mubuf.ll
index a574365da986..9e1d2e0490c7 100644
--- a/test/CodeGen/AMDGPU/mubuf.ll
+++ b/test/CodeGen/AMDGPU/mubuf.ll
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() readnone
; MUBUF load with an immediate byte offset that fits into 12 bits
; CHECK-LABEL: {{^}}mubuf_load0:
; CHECK: buffer_load_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0
-define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%0 = getelementptr i32, i32 addrspace(1)* %in, i64 1
%1 = load i32, i32 addrspace(1)* %0
@@ -20,7 +20,7 @@ entry:
; MUBUF load with the largest possible immediate offset
; CHECK-LABEL: {{^}}mubuf_load1:
; CHECK: buffer_load_ubyte v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0
-define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%0 = getelementptr i8, i8 addrspace(1)* %in, i64 4095
%1 = load i8, i8 addrspace(1)* %0
@@ -32,7 +32,7 @@ entry:
; CHECK-LABEL: {{^}}mubuf_load2:
; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000
; CHECK: buffer_load_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x30,0xe0
-define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%0 = getelementptr i32, i32 addrspace(1)* %in, i64 1024
%1 = load i32, i32 addrspace(1)* %0
@@ -44,7 +44,7 @@ entry:
; CHECK-LABEL: {{^}}mubuf_load3:
; CHECK-NOT: ADD
; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x30,0xe0
-define void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) {
+define amdgpu_kernel void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) {
entry:
%0 = getelementptr i32, i32 addrspace(1)* %in, i64 %offset
%1 = getelementptr i32, i32 addrspace(1)* %0, i64 1
@@ -91,7 +91,7 @@ main_body:
; MUBUF store with an immediate byte offset that fits into 12 bits
; CHECK-LABEL: {{^}}mubuf_store0:
; CHECK: buffer_store_dword v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x70,0xe0
-define void @mubuf_store0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @mubuf_store0(i32 addrspace(1)* %out) {
entry:
%0 = getelementptr i32, i32 addrspace(1)* %out, i64 1
store i32 0, i32 addrspace(1)* %0
@@ -102,7 +102,7 @@ entry:
; CHECK-LABEL: {{^}}mubuf_store1:
; CHECK: buffer_store_byte v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0
-define void @mubuf_store1(i8 addrspace(1)* %out) {
+define amdgpu_kernel void @mubuf_store1(i8 addrspace(1)* %out) {
entry:
%0 = getelementptr i8, i8 addrspace(1)* %out, i64 4095
store i8 0, i8 addrspace(1)* %0
@@ -113,7 +113,7 @@ entry:
; CHECK-LABEL: {{^}}mubuf_store2:
; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000
; CHECK: buffer_store_dword v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x70,0xe0
-define void @mubuf_store2(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @mubuf_store2(i32 addrspace(1)* %out) {
entry:
%0 = getelementptr i32, i32 addrspace(1)* %out, i64 1024
store i32 0, i32 addrspace(1)* %0
@@ -124,7 +124,7 @@ entry:
; CHECK-LABEL: {{^}}mubuf_store3:
; CHECK-NOT: ADD
; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x70,0xe0
-define void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) {
+define amdgpu_kernel void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) {
entry:
%0 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset
%1 = getelementptr i32, i32 addrspace(1)* %0, i64 1
@@ -134,14 +134,14 @@ entry:
; CHECK-LABEL: {{^}}store_sgpr_ptr:
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0
-define void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 {
store i32 99, i32 addrspace(1)* %out, align 4
ret void
}
; CHECK-LABEL: {{^}}store_sgpr_ptr_offset:
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40
-define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 10
store i32 99, i32 addrspace(1)* %out.gep, align 4
ret void
@@ -150,7 +150,7 @@ define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset:
; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
-define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 {
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768
store i32 99, i32 addrspace(1)* %out.gep, align 4
ret void
@@ -159,7 +159,7 @@ define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 {
; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic:
; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
; CHECK: buffer_atomic_add v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
-define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 {
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768
%val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 5 seq_cst
ret void
@@ -167,7 +167,7 @@ define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 {
; CHECK-LABEL: {{^}}store_vgpr_ptr:
; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
-define void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
store i32 99, i32 addrspace(1)* %out.gep, align 4
diff --git a/test/CodeGen/AMDGPU/mul.ll b/test/CodeGen/AMDGPU/mul.ll
index 7910b70d8cf2..a72a6efb0711 100644
--- a/test/CodeGen/AMDGPU/mul.ll
+++ b/test/CodeGen/AMDGPU/mul.ll
@@ -11,7 +11,7 @@
; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
%a = load <2 x i32>, <2 x i32> addrspace(1) * %in
%b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
@@ -31,7 +31,7 @@ define void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)
; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32>, <4 x i32> addrspace(1) * %in
%b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
@@ -45,7 +45,7 @@ define void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %
; SI: s_load_dword
; SI: s_mul_i32
; SI: buffer_store_dword
-define void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
%mul = mul i64 %b, %a
%trunc = trunc i64 %mul to i32
store i32 %trunc, i32 addrspace(1)* %out, align 8
@@ -57,7 +57,7 @@ define void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
; SI: s_load_dword
; SI: v_mul_lo_i32
; SI: buffer_store_dword
-define void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%b = load i64, i64 addrspace(1)* %bptr, align 8
%mul = mul i64 %b, %a
@@ -73,7 +73,7 @@ define void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %a
; EG-DAG: MULHI_INT
; SI-DAG: s_mul_i32
; SI-DAG: v_mul_hi_i32
-define void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) {
entry:
%0 = sext i32 %in to i64
%1 = mul i64 %0, 80
@@ -87,7 +87,7 @@ entry:
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_hi_i32
; SI: s_endpgm
-define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
%val = load i32, i32 addrspace(1)* %in, align 4
%ext = sext i32 %val to i64
%mul = mul i64 %ext, 80
@@ -99,7 +99,7 @@ define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
; SI: s_endpgm
-define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
%val = load i32, i32 addrspace(1)* %in, align 4
%ext = sext i32 %val to i64
%mul = mul i64 %ext, 9
@@ -114,7 +114,7 @@ define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
; SI: buffer_store_dword [[VRESULT]],
; SI: s_endpgm
-define void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%mul = mul i32 %a, %b
store i32 %mul, i32 addrspace(1)* %out, align 4
ret void
@@ -122,7 +122,7 @@ define void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; FUNC-LABEL: {{^}}v_mul_i32:
; SI: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load i32, i32 addrspace(1)* %in
%b = load i32, i32 addrspace(1)* %b_ptr
@@ -139,7 +139,7 @@ define void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; crash with a 'failed to select' error.
; FUNC-LABEL: {{^}}s_mul_i64:
-define void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%mul = mul i64 %a, %b
store i64 %mul, i64 addrspace(1)* %out, align 8
ret void
@@ -147,7 +147,7 @@ define void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; FUNC-LABEL: {{^}}v_mul_i64:
; SI: v_mul_lo_i32
-define void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
+define amdgpu_kernel void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%b = load i64, i64 addrspace(1)* %bptr, align 8
%mul = mul i64 %a, %b
@@ -157,7 +157,7 @@ define void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addr
; FUNC-LABEL: {{^}}mul32_in_branch:
; SI: s_mul_i32
-define void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) {
entry:
%0 = icmp eq i32 %a, 0
br i1 %0, label %if, label %else
@@ -180,7 +180,7 @@ endif:
; SI-DAG: s_mul_i32
; SI-DAG: v_mul_hi_u32
; SI: s_endpgm
-define void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+define amdgpu_kernel void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
entry:
%0 = icmp eq i64 %a, 0
br i1 %0, label %if, label %else
@@ -224,7 +224,7 @@ endif:
; SI: s_mul_i32
; SI: buffer_store_dwordx4
-define void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 {
+define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 {
%mul = mul i128 %a, %b
store i128 %mul, i128 addrspace(1)* %out
ret void
@@ -234,26 +234,26 @@ define void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 {
; SI: {{buffer|flat}}_load_dwordx4
; SI: {{buffer|flat}}_load_dwordx4
-; SI: v_mul_lo_i32
-; SI: v_mul_hi_u32
-; SI: v_mul_hi_u32
-; SI: v_mul_lo_i32
-; SI: v_mul_hi_u32
-; SI: v_mul_hi_u32
-; SI: v_mul_lo_i32
-; SI: v_mul_lo_i32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_lo_i32
; SI: v_add_i32_e32
-; SI: v_mul_hi_u32
-; SI: v_mul_lo_i32
-; SI: v_mul_hi_u32
-; SI: v_mul_lo_i32
-; SI: v_mul_lo_i32
-; SI: v_mul_lo_i32
-; SI: v_mul_lo_i32
-; SI: v_mul_lo_i32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_lo_i32
; SI: {{buffer|flat}}_store_dwordx4
-define void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 {
+define amdgpu_kernel void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%gep.a = getelementptr inbounds i128, i128 addrspace(1)* %aptr, i32 %tid
%gep.b = getelementptr inbounds i128, i128 addrspace(1)* %bptr, i32 %tid
diff --git a/test/CodeGen/AMDGPU/mul_int24.ll b/test/CodeGen/AMDGPU/mul_int24.ll
index 6f7dfe2e13eb..3137569e9ca7 100644
--- a/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/test/CodeGen/AMDGPU/mul_int24.ll
@@ -13,7 +13,7 @@
; Make sure we are not masking the inputs
; CM-NOT: AND
; CM: MUL_INT24
-define void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%a.shl = shl i32 %a, 8
%a.24 = ashr i32 %a.shl, 8
@@ -39,7 +39,7 @@ entry:
; CM: MULHI_INT24
; CM: MULHI_INT24
; CM: MULHI_INT24
-define void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%a.shl = shl i32 %a, 8
%a.24 = ashr i32 %a.shl, 8
@@ -70,7 +70,7 @@ entry:
; GCN-DAG: v_mul_i32_i24_e32
; GCN: buffer_store_dwordx2
-define void @test_smul24_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%shl.i = shl i32 %a, 8
%shr.i = ashr i32 %shl.i, 8
%conv.i = sext i32 %shr.i to i64
@@ -87,7 +87,7 @@ define void @test_smul24_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; GCN-DAG: v_mul_hi_i32_i24_e64 v{{[0-9]+}}, [[A]], [[A]]
; GCN-DAG: v_mul_i32_i24_e64 v{{[0-9]+}}, [[A]], [[A]]
; GCN: buffer_store_dwordx2
-define void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%shl.i = shl i32 %a, 8
%shr.i = ashr i32 %shl.i, 8
%conv.i = sext i32 %shr.i to i64
@@ -112,7 +112,7 @@ define void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; VI: v_ashrrev_i64 v{{\[[0-9]+:[0-9]+\]}}, 31, v{{\[[0-9]+:[0-9]+\]}}
; GCN: buffer_store_dwordx2
-define void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 {
+define amdgpu_kernel void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 {
entry:
%a.shl = shl i33 %a, 9
%a.24 = ashr i33 %a.shl, 9
@@ -133,7 +133,7 @@ entry:
; SI: v_mul_hi_i32_i24_e32 v[[MUL_HI:[0-9]+]],
; SI-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
; SI-NEXT: buffer_store_dword v[[HI]]
-define void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
+define amdgpu_kernel void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
entry:
%tmp0 = shl i33 %a, 9
%a_24 = ashr i33 %tmp0, 9
@@ -151,7 +151,7 @@ entry:
; GCN: v_mul_i32_i24_e32 v[[VAL_LO:[0-9]+]]
; GCN: v_mov_b32_e32 v[[VAL_HI:[0-9]+]], v[[VAL_LO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
-define void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) {
+define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) {
bb:
%cmp = icmp eq i32 %arg0, 0
br i1 %cmp, label %bb11, label %bb7
diff --git a/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index 004d36f00e51..59fdc8be5cea 100644
--- a/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
; FUNC-LABEL: {{^}}test_umul24_i32:
; GCN: v_mul_u32_u24
-define void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
entry:
%0 = shl i32 %a, 8
%a_24 = lshr i32 %0, 8
@@ -22,7 +22,7 @@ entry:
; SI: v_bfe_i32 v{{[0-9]}}, [[VI_MUL]], 0, 16
; VI: s_mul_i32 [[SI_MUL:s[0-9]]], s{{[0-9]}}, s{{[0-9]}}
; VI: s_sext_i32_i16 s{{[0-9]}}, [[SI_MUL]]
-define void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
+define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
entry:
%mul = mul i16 %a, %b
%ext = sext i16 %mul to i32
@@ -34,7 +34,7 @@ entry:
; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16
-define void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
%ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x
@@ -54,7 +54,7 @@ define void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)*
; VI: s_mul_i32
; VI: s_and_b32
; VI: v_mov_b32_e32
-define void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) {
+define amdgpu_kernel void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) {
entry:
%mul = mul i16 %a, %b
%ext = zext i16 %mul to i32
@@ -66,7 +66,7 @@ entry:
; SI: v_mul_u32_u24_e32
; SI: v_and_b32_e32
; VI: v_mul_lo_u16
-define void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
%ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x
@@ -83,7 +83,7 @@ define void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in)
; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
-define void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) {
+define amdgpu_kernel void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) {
entry:
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
@@ -101,7 +101,7 @@ entry:
; GCN-NOT: and
; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]],
; GCN-NEXT: buffer_store_dword [[RESULT]]
-define void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
entry:
%a.24 = and i32 %a, 16777215
%b.24 = and i32 %b, 16777215
@@ -118,7 +118,7 @@ entry:
; GCN-NOT: and
; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]],
; GCN-NEXT: buffer_store_dword [[RESULT]]
-define void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
entry:
%a.24 = and i64 %a, 16777215
%b.24 = and i64 %b, 16777215
@@ -136,7 +136,7 @@ entry:
; GCN-DAG: v_mul_u32_u24_e32
; GCN-DAG: v_mul_hi_u32_u24_e32
; GCN: buffer_store_dwordx2
-define void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
entry:
%tmp0 = shl i64 %a, 40
%a_24 = lshr i64 %tmp0, 40
@@ -152,7 +152,7 @@ entry:
; GCN-NOT: s_and_b32
; GCN-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]]
; GCN-DAG: v_mul_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]]
-define void @test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) {
entry:
%tmp0 = shl i64 %a, 40
%a.24 = lshr i64 %tmp0, 40
@@ -166,7 +166,7 @@ entry:
; GCN: s_and_b32
; GCN: v_mul_u32_u24_e32 [[MUL24:v[0-9]+]]
; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[MUL24]]
-define void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) {
entry:
%a.16 = and i32 %a, 65535
%b.16 = and i32 %b, 65535
@@ -186,7 +186,7 @@ entry:
; GCN-DAG: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]],
; GCN-DAG: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[HI]]{{\]}}
-define void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) {
+define amdgpu_kernel void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) {
entry:
%tmp0 = shl i33 %a, 9
%a_24 = lshr i33 %tmp0, 9
@@ -206,7 +206,7 @@ entry:
; GCN: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]],
; GCN-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
; GCN-NEXT: buffer_store_dword v[[HI]]
-define void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
+define amdgpu_kernel void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
entry:
%tmp0 = shl i33 %a, 9
%a_24 = lshr i33 %tmp0, 9
diff --git a/test/CodeGen/AMDGPU/mul_uint24-r600.ll b/test/CodeGen/AMDGPU/mul_uint24-r600.ll
index da1c111fa5c0..0a646b7126d0 100644
--- a/test/CodeGen/AMDGPU/mul_uint24-r600.ll
+++ b/test/CodeGen/AMDGPU/mul_uint24-r600.ll
@@ -3,7 +3,7 @@
; FUNC-LABEL: {{^}}test_umul24_i32:
; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
-define void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
entry:
%0 = shl i32 %a, 8
%a_24 = lshr i32 %0, 8
@@ -19,7 +19,7 @@ entry:
; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
; EG: 16
-define void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
+define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
entry:
%mul = mul i16 %a, %b
%ext = sext i16 %mul to i32
@@ -31,7 +31,7 @@ entry:
; FUNC-LABEL: {{^}}test_umul24_i8:
; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
-define void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) {
+define amdgpu_kernel void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) {
entry:
%mul = mul i8 %a, %b
%ext = sext i8 %mul to i32
@@ -41,7 +41,7 @@ entry:
; FUNC-LABEL: {{^}}test_umulhi24_i32_i64:
; EG: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
-define void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
entry:
%a.24 = and i32 %a, 16777215
%b.24 = and i32 %b, 16777215
@@ -56,7 +56,7 @@ entry:
; FUNC-LABEL: {{^}}test_umulhi24:
; EG: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
-define void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
entry:
%a.24 = and i64 %a, 16777215
%b.24 = and i64 %b, 16777215
@@ -71,7 +71,7 @@ entry:
; FUNC-LABEL: {{^}}test_umul24_i64:
; EG: MUL_UINT24
; EG: MULHI
-define void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
entry:
%tmp0 = shl i64 %a, 40
%a_24 = lshr i64 %tmp0, 40
diff --git a/test/CodeGen/AMDGPU/mulhu.ll b/test/CodeGen/AMDGPU/mulhu.ll
deleted file mode 100644
index 29b0944a5533..000000000000
--- a/test/CodeGen/AMDGPU/mulhu.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab
-;CHECK: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}}
-;CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0
-
-define void @test(i32 %p) {
- %i = udiv i32 %p, 3
- %r = bitcast i32 %i to float
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
- ret void
-}
-
-declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
new file mode 100644
index 000000000000..9d0b6b395996
--- /dev/null
+++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -0,0 +1,710 @@
+; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Add extra verifier runs. There were some cases where invalid IR
+; was produced but happened to be fixed by later passes.
+
+; Make sure divergent control flow with multiple exits from a region
+; is properly handled. UnifyFunctionExitNodes should be run before
+; StructurizeCFG.
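+;
+; A minimal sketch of the problematic shape (illustrative only; the
+; functions below are the real inputs):
+;   br i1 %divergent.cond, label %ret0, label %ret1 ; two exits from region
+;   ret0: ret void
+;   ret1: ret void
+; amdgpu-unify-divergent-exit-nodes funnels both returns into a single
+; UnifiedReturnBlock, so StructurizeCFG only ever sees single-exit regions.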
+
+; IR-LABEL: @multi_divergent_region_exit_ret_ret(
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+; IR: %2 = extractvalue { i1, i64 } %1, 0
+; IR: %3 = extractvalue { i1, i64 } %1, 1
+; IR: br i1 %2, label %LeafBlock1, label %Flow
+
+; IR: Flow:
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+; IR: %7 = extractvalue { i1, i64 } %6, 0
+; IR: %8 = extractvalue { i1, i64 } %6, 1
+; IR: br i1 %7, label %LeafBlock, label %Flow1
+
+; IR: LeafBlock:
+; IR: br label %Flow1
+
+; IR: LeafBlock1:
+; IR: br label %Flow{{$}}
+
+; IR: Flow2:
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: %13 = extractvalue { i1, i64 } %12, 0
+; IR: %14 = extractvalue { i1, i64 } %12, 1
+; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
+
+; IR: exit0:
+; IR: store volatile i32 9, i32 addrspace(1)* undef
+; IR: br label %UnifiedReturnBlock
+
+; IR: Flow1:
+; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
+; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %8)
+; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
+; IR: %18 = extractvalue { i1, i64 } %17, 0
+; IR: %19 = extractvalue { i1, i64 } %17, 1
+; IR: br i1 %18, label %exit1, label %Flow2
+
+; IR: exit1:
+; IR: store volatile i32 17, i32 addrspace(3)* undef
+; IR: br label %Flow2
+
+; IR: UnifiedReturnBlock:
+; IR: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR: ret void
+
+
+; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
+; GCN: v_cmp_lt_i32_e32 vcc, 1
+; GCN: s_and_saveexec_b64
+; GCN: s_xor_b64
+
+
+; FIXME: Why is this compare essentially repeated?
+; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
+; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]]
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1
+
+; GCN: ; %Flow1
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN: v_cmp_ne_u32_e32 vcc, 0
+
+; GCN: ; %exit1
+; GCN: ds_write_b32
+
+; GCN: %Flow2
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN: v_cmp_ne_u32_e32 vcc, 0
+; GCN-NEXT: s_and_saveexec_b64
+; GCN-NEXT: s_xor_b64
+
+; GCN: ; %exit0
+; GCN: buffer_store_dword
+
+; GCN: ; %UnifiedReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %Pivot = icmp slt i32 %tmp16, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %tmp16, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+}
+
+; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable(
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock
+
+
+; IR: UnifiedUnreachableBlock:
+; IR-NEXT: unreachable
+
+
+; FIXME: Probably should insert an s_endpgm anyway.
+; GCN-LABEL: {{^}}multi_divergent_region_exit_unreachable_unreachable:
+; GCN: ; %UnifiedUnreachableBlock
+; GCN-NEXT: .Lfunc_end
+define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %Pivot = icmp slt i32 %tmp16, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %tmp16, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ unreachable
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ unreachable
+}
+
+; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret(
+; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2
+; IR: llvm.amdgcn.if
+; IR: br i1
+
+; IR: {{^}}Flow:
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+; IR: br i1 %7, label %LeafBlock, label %Flow1
+
+; IR: {{^}}LeafBlock:
+; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1
+; IR: %9 = xor i1 %divergent.cond1, true
+; IR: br label %Flow1
+
+; IR: LeafBlock1:
+; IR: %uniform.cond0 = icmp eq i32 %arg3, 2
+; IR: %10 = xor i1 %uniform.cond0, true
+; IR: br label %Flow
+
+; IR: Flow2:
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
+
+; IR: exit0:
+; IR: store volatile i32 9, i32 addrspace(1)* undef
+; IR: br label %UnifiedReturnBlock
+
+; IR: {{^}}Flow1:
+; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ]
+; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %8)
+; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
+; IR: %18 = extractvalue { i1, i64 } %17, 0
+; IR: %19 = extractvalue { i1, i64 } %17, 1
+; IR: br i1 %18, label %exit1, label %Flow2
+
+; IR: exit1:
+; IR: store volatile i32 17, i32 addrspace(3)* undef
+; IR: br label %Flow2
+
+; IR: UnifiedReturnBlock:
+; IR: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR: ret void
+define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %divergent.cond0 = icmp slt i32 %tmp16, 2
+ br i1 %divergent.cond0, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %divergent.cond1 = icmp eq i32 %tmp16, 1
+ br i1 %divergent.cond1, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %uniform.cond0 = icmp eq i32 %arg3, 2
+ br i1 %uniform.cond0, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+}
+
+; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret(
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+; IR: br i1 %2, label %LeafBlock1, label %Flow
+
+; IR: Flow:
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+
+define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %Pivot = icmp slt i32 %tmp16, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %arg3, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+}
+
+; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value(
+; IR: Flow2:
+; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ]
+; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %20)
+
+; IR: UnifiedReturnBlock:
+; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %15)
+; IR: ret float %UnifiedRetVal
+define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 {
+entry:
+ %Pivot = icmp slt i32 %vgpr, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %vgpr, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %vgpr, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store i32 9, i32 addrspace(1)* undef
+ ret float 1.0
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store i32 17, i32 addrspace(3)* undef
+ ret float 2.0
+}
+
+; IR-LABEL: @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(
+
+; GCN-LABEL: {{^}}uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value:
+; GCN: s_cmp_gt_i32 s0, 1
+; GCN: s_cbranch_scc0 [[FLOW:BB[0-9]+_[0-9]+]]
+
+; GCN: v_cmp_ne_u32_e32 vcc, 7, v0
+
+; GCN: {{^}}[[FLOW]]:
+; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]]
+
+; GCN: v_mov_b32_e32 v0, 2.0
+; GCN: s_or_b64 exec, exec
+; GCN: s_and_b64 exec, exec
+; GCN: v_mov_b32_e32 v0, 1.0
+
+; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: ; return
+
+define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 {
+entry:
+ %uniform.cond = icmp slt i32 %sgpr, 2
+ br i1 %uniform.cond, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %divergent.cond0 = icmp eq i32 %vgpr, 3
+ br i1 %divergent.cond0, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %divergent.cond1 = icmp eq i32 %vgpr, 7
+ br i1 %divergent.cond1, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store i32 9, i32 addrspace(1)* undef
+ ret float 1.0
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store i32 17, i32 addrspace(3)* undef
+ ret float 2.0
+}
+
+; IR-LABEL: @multi_divergent_region_exit_ret_unreachable(
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+
+; IR: Flow:
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+
+; IR: Flow2:
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
+
+; IR: exit0:
+; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
+; IR-NEXT: br label %UnifiedReturnBlock
+
+; IR: Flow1:
+; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
+; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %8)
+; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
+; IR: %18 = extractvalue { i1, i64 } %17, 0
+; IR: %19 = extractvalue { i1, i64 } %17, 1
+; IR: br i1 %18, label %exit1, label %Flow2
+
+; IR: exit1:
+; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef
+; IR-NEXT: call void @llvm.amdgcn.unreachable()
+; IR-NEXT: br label %Flow2
+
+; IR: UnifiedReturnBlock:
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR-NEXT: ret void
+define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %Pivot = icmp slt i32 %tmp16, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %tmp16, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ unreachable
+}
+
+; The non-uniformity of the branch to the exiting blocks requires
+; looking at transitive predecessors.
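+; For example, in the function below %exit1 is reached only through
+; %indirect.exit1, whose branch is unconditional; the divergence is only
+; visible in the conditional branches of its predecessors %LeafBlock and
+; %LeafBlock1.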
+
+; IR-LABEL: @indirect_multi_divergent_region_exit_ret_unreachable(
+
+; IR: exit0: ; preds = %Flow2
+; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
+; IR-NEXT: br label %UnifiedReturnBlock
+
+
+; IR: indirect.exit1:
+; IR: %load = load volatile i32, i32 addrspace(1)* undef
+; IR: store volatile i32 %load, i32 addrspace(1)* undef
+; IR: store volatile i32 9, i32 addrspace(1)* undef
+; IR: call void @llvm.amdgcn.unreachable()
+; IR-NEXT: br label %Flow2
+
+; IR: UnifiedReturnBlock: ; preds = %exit0, %Flow2
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR-NEXT: ret void
+define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %Pivot = icmp slt i32 %tmp16, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %tmp16, 1
+ br i1 %SwitchLeaf, label %exit0, label %indirect.exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %indirect.exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+
+indirect.exit1:
+ %load = load volatile i32, i32 addrspace(1)* undef
+ store volatile i32 %load, i32 addrspace(1)* undef
+ br label %exit1
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ unreachable
+}
+
+; IR-LABEL: @multi_divergent_region_exit_ret_switch(
+define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ switch i32 %tmp16, label %exit1
+ [ i32 1, label %LeafBlock
+ i32 2, label %LeafBlock1
+ i32 3, label %exit0 ]
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %tmp16, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ unreachable
+}
+
+; IR-LABEL: @divergent_multi_ret_nest_in_uniform_triangle(
+define amdgpu_kernel void @divergent_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
+entry:
+ %uniform.cond0 = icmp eq i32 %arg0, 4
+ br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret
+
+divergent.multi.exit.region:
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %divergent.cond0 = icmp eq i32 %id.x, 0
+ br i1 %divergent.cond0, label %divergent.ret0, label %divergent.ret1
+
+divergent.ret0:
+ store volatile i32 11, i32 addrspace(3)* undef
+ ret void
+
+divergent.ret1:
+ store volatile i32 42, i32 addrspace(3)* undef
+ ret void
+
+uniform.ret:
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+}
+
+; IR-LABEL: @divergent_complex_multi_ret_nest_in_uniform_triangle(
+define amdgpu_kernel void @divergent_complex_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
+entry:
+ %uniform.cond0 = icmp eq i32 %arg0, 4
+ br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret
+
+divergent.multi.exit.region:
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %divergent.cond0 = icmp eq i32 %id.x, 0
+ br i1 %divergent.cond0, label %divergent.if, label %divergent.ret1
+
+divergent.if:
+ %vgpr0 = load volatile float, float addrspace(1)* undef
+ %divergent.cond1 = fcmp ogt float %vgpr0, 1.0
+ br i1 %divergent.cond1, label %divergent.then, label %divergent.endif
+
+divergent.then:
+ %vgpr1 = load volatile float, float addrspace(1)* undef
+ %divergent.cond2 = fcmp olt float %vgpr1, 4.0
+ store volatile i32 33, i32 addrspace(1)* undef
+ br i1 %divergent.cond2, label %divergent.ret0, label %divergent.endif
+
+divergent.endif:
+ store volatile i32 38, i32 addrspace(1)* undef
+ br label %divergent.ret0
+
+divergent.ret0:
+ store volatile i32 11, i32 addrspace(3)* undef
+ ret void
+
+divergent.ret1:
+ store volatile i32 42, i32 addrspace(3)* undef
+ ret void
+
+uniform.ret:
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+}
+
+; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle(
+; IR: Flow1: ; preds = %uniform.ret1, %uniform.multi.exit.region
+; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
+; IR: br i1 %8, label %uniform.if, label %Flow2
+
+; IR: Flow: ; preds = %uniform.then, %uniform.if
+; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ]
+; IR: br i1 %11, label %uniform.endif, label %uniform.ret0
+
+; IR: UnifiedReturnBlock: ; preds = %Flow3, %Flow2
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6)
+; IR-NEXT: ret void
+define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 {
+entry:
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %divergent.cond0 = icmp eq i32 %id.x, 0
+ br i1 %divergent.cond0, label %uniform.multi.exit.region, label %divergent.ret
+
+uniform.multi.exit.region:
+ %uniform.cond0 = icmp eq i32 %arg0, 4
+ br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1
+
+uniform.if:
+ %sgpr0 = load volatile i32, i32 addrspace(2)* undef
+ %uniform.cond1 = icmp slt i32 %sgpr0, 1
+ br i1 %uniform.cond1, label %uniform.then, label %uniform.endif
+
+uniform.then:
+ %sgpr1 = load volatile i32, i32 addrspace(2)* undef
+ %uniform.cond2 = icmp sge i32 %sgpr1, 4
+ store volatile i32 33, i32 addrspace(1)* undef
+ br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif
+
+uniform.endif:
+ store volatile i32 38, i32 addrspace(1)* undef
+ br label %uniform.ret0
+
+uniform.ret0:
+ store volatile i32 11, i32 addrspace(3)* undef
+ ret void
+
+uniform.ret1:
+ store volatile i32 42, i32 addrspace(3)* undef
+ ret void
+
+divergent.ret:
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+}
+
+; IR-LABEL: @multi_divergent_unreachable_exit(
+; IR: UnifiedUnreachableBlock:
+; IR-NEXT: call void @llvm.amdgcn.unreachable()
+; IR-NEXT: br label %UnifiedReturnBlock
+
+; IR: UnifiedReturnBlock:
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64
+; IR-NEXT: ret void
+define amdgpu_kernel void @multi_divergent_unreachable_exit() #0 {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ switch i32 %tmp, label %bb3 [
+ i32 2, label %bb1
+ i32 0, label %bb2
+ ]
+
+bb1: ; preds = %bb
+ unreachable
+
+bb2: ; preds = %bb
+ unreachable
+
+bb3: ; preds = %bb
+ switch i32 undef, label %bb5 [
+ i32 2, label %bb4
+ ]
+
+bb4: ; preds = %bb3
+ ret void
+
+bb5: ; preds = %bb3
+ unreachable
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/multilevel-break.ll b/test/CodeGen/AMDGPU/multilevel-break.ll
index 95c7ce862329..15de689b953e 100644
--- a/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -64,7 +64,7 @@ ENDIF: ; preds = %LOOP
br i1 %tmp51, label %LOOP, label %LOOP.outer
}
-; OPT-LABEL: define void @multi_if_break_loop(
+; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop(
; OPT: llvm.amdgcn.break
; OPT: llvm.amdgcn.loop
; OPT: llvm.amdgcn.if.break
@@ -79,7 +79,7 @@ ENDIF: ; preds = %LOOP
; Uses a copy instead of an or
; GCN: s_mov_b64 [[COPY:s\[[0-9]+:[0-9]+\]]], [[BREAK_REG]]
; GCN: s_or_b64 [[BREAK_REG]], exec, [[COPY]]
-define void @multi_if_break_loop(i32 %arg) #0 {
+define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
%tmp = sub i32 %id, %arg
diff --git a/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/test/CodeGen/AMDGPU/nested-loop-conditions.ll
new file mode 100644
index 000000000000..672549c8ea63
--- /dev/null
+++ b/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -0,0 +1,269 @@
+; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; After structurizing, there are 3 levels of loops. The i1 phi
+; conditions mutually depend on each other, so it isn't safe to delete
+; the condition that appears to have no uses until the loop is
+; completely processed.
+
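+; A reduced sketch of the hazard (illustrative only, not the test's
+; actual IR): each i1 phi is the other's incoming value, so the phi
+; that currently appears unused cannot be deleted until the whole loop
+; nest has been processed:
+;
+;   header:
+;     %a = phi i1 [ false, %entry ], [ %b, %latch ]
+;     br label %latch
+;
+;   latch:
+;     %b = phi i1 [ %a, %header ]
+;     br i1 %b, label %header, label %exit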
+
+; IR-LABEL: @reduced_nested_loop_conditions(
+
+; IR: bb5:
+; IR-NEXT: %phi.broken = phi i64 [ %loop.phi, %bb10 ], [ 0, %bb ]
+; IR-NEXT: %tmp6 = phi i32 [ 0, %bb ], [ %tmp11, %bb10 ]
+; IR-NEXT: %tmp7 = icmp eq i32 %tmp6, 1
+; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %tmp7)
+; IR-NEXT: %1 = extractvalue { i1, i64 } %0, 0
+; IR-NEXT: %2 = extractvalue { i1, i64 } %0, 1
+; IR-NEXT: br i1 %1, label %bb8, label %Flow
+
+; IR: bb8:
+; IR-NEXT: %3 = call i64 @llvm.amdgcn.break(i64 %phi.broken)
+; IR-NEXT: br label %bb13
+
+; IR: bb10:
+; IR-NEXT: %loop.phi = phi i64 [ %6, %Flow ]
+; IR-NEXT: %tmp11 = phi i32 [ %5, %Flow ]
+; IR-NEXT: %4 = call i1 @llvm.amdgcn.loop(i64 %loop.phi)
+; IR-NEXT: br i1 %4, label %bb23, label %bb5
+
+; IR: Flow:
+; IR-NEXT: %loop.phi1 = phi i64 [ %loop.phi2, %bb4 ], [ %phi.broken, %bb5 ]
+; IR-NEXT: %5 = phi i32 [ %tmp21, %bb4 ], [ undef, %bb5 ]
+; IR-NEXT: %6 = call i64 @llvm.amdgcn.else.break(i64 %2, i64 %loop.phi1)
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %2)
+; IR-NEXT: br label %bb10
+
+; IR: bb13:
+; IR-NEXT: %loop.phi3 = phi i64 [ %loop.phi4, %bb3 ], [ %3, %bb8 ]
+; IR-NEXT: %tmp14 = phi i1 [ false, %bb3 ], [ true, %bb8 ]
+; IR-NEXT: %tmp15 = bitcast i64 %tmp2 to <2 x i32>
+; IR-NEXT: br i1 %tmp14, label %bb16, label %bb20
+
+; IR: bb16:
+; IR-NEXT: %tmp17 = extractelement <2 x i32> %tmp15, i64 1
+; IR-NEXT: %tmp18 = getelementptr inbounds i32, i32 addrspace(3)* undef, i32 %tmp17
+; IR-NEXT: %tmp19 = load volatile i32, i32 addrspace(3)* %tmp18
+; IR-NEXT: br label %bb20
+
+; IR: bb20:
+; IR-NEXT: %loop.phi4 = phi i64 [ %phi.broken, %bb16 ], [ %phi.broken, %bb13 ]
+; IR-NEXT: %loop.phi2 = phi i64 [ %phi.broken, %bb16 ], [ %loop.phi3, %bb13 ]
+; IR-NEXT: %tmp21 = phi i32 [ %tmp19, %bb16 ], [ 0, %bb13 ]
+; IR-NEXT: br label %bb9
+
+; IR: bb23:
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
+; IR-NEXT: ret void
+
+; GCN-LABEL: {{^}}reduced_nested_loop_conditions:
+
+; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 1
+; GCN-NEXT: s_cbranch_scc1
+
+; FIXME: Should fold to unconditional branch?
+; GCN: s_mov_b64 vcc, -1
+; GCN-NEXT: ; implicit-def
+; GCN: s_cbranch_vccz
+
+; GCN: ds_read_b32
+
+; GCN: [[BB9:BB[0-9]+_[0-9]+]]: ; %bb9
+; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: s_branch [[BB9]]
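+; (%bb9 ends in "br i1 false, label %bb3, label %bb9", so only the
+; self-edge is ever taken.)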
+define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* nocapture %arg) #0 {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = getelementptr inbounds i64, i64 addrspace(3)* %arg, i32 %tmp
+ %tmp2 = load volatile i64, i64 addrspace(3)* %tmp1
+ br label %bb5
+
+bb3: ; preds = %bb9
+ br i1 true, label %bb4, label %bb13
+
+bb4: ; preds = %bb3
+ br label %bb10
+
+bb5: ; preds = %bb10, %bb
+ %tmp6 = phi i32 [ 0, %bb ], [ %tmp11, %bb10 ]
+ %tmp7 = icmp eq i32 %tmp6, 1
+ br i1 %tmp7, label %bb8, label %bb10
+
+bb8: ; preds = %bb5
+ br label %bb13
+
+bb9: ; preds = %bb20, %bb9
+ br i1 false, label %bb3, label %bb9
+
+bb10: ; preds = %bb5, %bb4
+ %tmp11 = phi i32 [ %tmp21, %bb4 ], [ undef, %bb5 ]
+ %tmp12 = phi i1 [ %tmp22, %bb4 ], [ true, %bb5 ]
+ br i1 %tmp12, label %bb23, label %bb5
+
+bb13: ; preds = %bb8, %bb3
+ %tmp14 = phi i1 [ %tmp22, %bb3 ], [ true, %bb8 ]
+ %tmp15 = bitcast i64 %tmp2 to <2 x i32>
+ br i1 %tmp14, label %bb16, label %bb20
+
+bb16: ; preds = %bb13
+ %tmp17 = extractelement <2 x i32> %tmp15, i64 1
+ %tmp18 = getelementptr inbounds i32, i32 addrspace(3)* undef, i32 %tmp17
+ %tmp19 = load volatile i32, i32 addrspace(3)* %tmp18
+ br label %bb20
+
+bb20: ; preds = %bb16, %bb13
+ %tmp21 = phi i32 [ %tmp19, %bb16 ], [ 0, %bb13 ]
+ %tmp22 = phi i1 [ false, %bb16 ], [ %tmp14, %bb13 ]
+ br label %bb9
+
+bb23: ; preds = %bb10
+ ret void
+}
+
+; Earlier version of above, before a run of the structurizer.
+; IR-LABEL: @nested_loop_conditions(
+
+; IR: Flow7:
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %17)
+; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %15)
+; IR-NEXT: %1 = extractvalue { i1, i64 } %0, 0
+; IR-NEXT: %2 = extractvalue { i1, i64 } %0, 1
+; IR-NEXT: br i1 %1, label %bb4.bb13_crit_edge, label %Flow8
+
+; IR: Flow1:
+; IR-NEXT: %loop.phi = phi i64 [ %loop.phi9, %Flow6 ], [ %phi.broken, %bb14 ]
+; IR-NEXT: %13 = phi <4 x i32> [ %29, %Flow6 ], [ undef, %bb14 ]
+; IR-NEXT: %14 = phi i32 [ %30, %Flow6 ], [ undef, %bb14 ]
+; IR-NEXT: %15 = phi i1 [ %31, %Flow6 ], [ false, %bb14 ]
+; IR-NEXT: %16 = phi i1 [ false, %Flow6 ], [ %8, %bb14 ]
+; IR-NEXT: %17 = call i64 @llvm.amdgcn.else.break(i64 %11, i64 %loop.phi)
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11)
+; IR-NEXT: %18 = call i1 @llvm.amdgcn.loop(i64 %17)
+; IR-NEXT: br i1 %18, label %Flow7, label %bb14
+
+; IR: Flow2:
+; IR-NEXT: %loop.phi10 = phi i64 [ %loop.phi11, %Flow5 ], [ %12, %bb16 ]
+; IR-NEXT: %19 = phi <4 x i32> [ %29, %Flow5 ], [ undef, %bb16 ]
+; IR-NEXT: %20 = phi i32 [ %30, %Flow5 ], [ undef, %bb16 ]
+; IR-NEXT: %21 = phi i1 [ %31, %Flow5 ], [ false, %bb16 ]
+; IR-NEXT: %22 = phi i1 [ false, %Flow5 ], [ false, %bb16 ]
+; IR-NEXT: %23 = phi i1 [ false, %Flow5 ], [ %8, %bb16 ]
+; IR-NEXT: %24 = call { i1, i64 } @llvm.amdgcn.if(i1 %23)
+; IR-NEXT: %25 = extractvalue { i1, i64 } %24, 0
+; IR-NEXT: %26 = extractvalue { i1, i64 } %24, 1
+; IR-NEXT: br i1 %25, label %bb21, label %Flow3
+
+; IR: bb21:
+; IR: %tmp12 = icmp slt i32 %tmp11, 9
+; IR-NEXT: %27 = xor i1 %tmp12, true
+; IR-NEXT: %28 = call i64 @llvm.amdgcn.if.break(i1 %27, i64 %phi.broken)
+; IR-NEXT: br label %Flow3
+
+; IR: Flow3:
+; IR-NEXT: %loop.phi11 = phi i64 [ %phi.broken, %bb21 ], [ %phi.broken, %Flow2 ]
+; IR-NEXT: %loop.phi9 = phi i64 [ %28, %bb21 ], [ %loop.phi10, %Flow2 ]
+; IR-NEXT: %29 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ]
+; IR-NEXT: %30 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ]
+; IR-NEXT: %31 = phi i1 [ %27, %bb21 ], [ %21, %Flow2 ]
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %26)
+; IR-NEXT: br i1 %22, label %bb31.loopexit, label %Flow4
+
+; IR: bb31:
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %7)
+; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef
+; IR-NEXT: ret void
+
+
+; GCN-LABEL: {{^}}nested_loop_conditions:
+
+; GCN: v_cmp_lt_i32_e32 vcc, 8, v
+; GCN: s_and_b64 vcc, exec, vcc
+; GCN: s_cbranch_vccnz [[BB31:BB[0-9]+_[0-9]+]]
+
+; GCN: [[BB14:BB[0-9]+_[0-9]+]]: ; %bb14
+; GCN: v_cmp_ne_u32_e32 vcc, 1, v
+; GCN-NEXT: s_and_b64 vcc, exec, vcc
+; GCN-NEXT: s_cbranch_vccnz [[BB31]]
+
+; GCN: [[BB18:BB[0-9]+_[0-9]+]]: ; %bb18
+; GCN: buffer_load_dword
+; GCN: v_cmp_lt_i32_e32 vcc, 8, v
+; GCN-NEXT: s_and_b64 vcc, exec, vcc
+; GCN-NEXT: s_cbranch_vccnz [[BB18]]
+
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: v_cmp_gt_i32_e32 vcc, 9
+; GCN-NEXT: s_and_b64 vcc, exec, vcc
+; GCN-NEXT: s_cbranch_vccnz [[BB14]]
+
+; GCN: [[BB31]]:
+; GCN: buffer_store_dword
+; GCN: s_endpgm
+define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %arg) #0 {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = zext i32 %tmp to i64
+ %tmp2 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp1
+ %tmp3 = load i64, i64 addrspace(1)* %tmp2, align 16
+ %tmp932 = load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16
+ %tmp1033 = extractelement <4 x i32> %tmp932, i64 0
+ %tmp1134 = load volatile i32, i32 addrspace(1)* undef
+ %tmp1235 = icmp slt i32 %tmp1134, 9
+ br i1 %tmp1235, label %bb14.lr.ph, label %bb13
+
+bb14.lr.ph: ; preds = %bb
+ br label %bb14
+
+bb4.bb13_crit_edge: ; preds = %bb21
+ br label %bb13
+
+bb13: ; preds = %bb4.bb13_crit_edge, %bb
+ br label %bb31
+
+bb14: ; preds = %bb21, %bb14.lr.ph
+ %tmp1037 = phi i32 [ %tmp1033, %bb14.lr.ph ], [ %tmp10, %bb21 ]
+ %tmp936 = phi <4 x i32> [ %tmp932, %bb14.lr.ph ], [ %tmp9, %bb21 ]
+ %tmp15 = icmp eq i32 %tmp1037, 1
+ br i1 %tmp15, label %bb16, label %bb31.loopexit
+
+bb16: ; preds = %bb14
+ %tmp17 = bitcast i64 %tmp3 to <2 x i32>
+ br label %bb18
+
+bb18: ; preds = %bb18, %bb16
+ %tmp19 = load volatile i32, i32 addrspace(1)* undef
+ %tmp20 = icmp slt i32 %tmp19, 9
+ br i1 %tmp20, label %bb21, label %bb18
+
+bb21: ; preds = %bb18
+ %tmp22 = extractelement <2 x i32> %tmp17, i64 1
+ %tmp23 = lshr i32 %tmp22, 16
+ %tmp24 = select i1 undef, i32 undef, i32 %tmp23
+ %tmp25 = uitofp i32 %tmp24 to float
+ %tmp26 = fmul float %tmp25, 0x3EF0001000000000
+ %tmp27 = fsub float %tmp26, undef
+ %tmp28 = fcmp olt float %tmp27, 5.000000e-01
+ %tmp29 = select i1 %tmp28, i64 1, i64 2
+ %tmp30 = extractelement <4 x i32> %tmp936, i64 %tmp29
+ %tmp7 = zext i32 %tmp30 to i64
+ %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* undef, i64 %tmp7
+ %tmp9 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp8, align 16
+ %tmp10 = extractelement <4 x i32> %tmp9, i64 0
+ %tmp11 = load volatile i32, i32 addrspace(1)* undef
+ %tmp12 = icmp slt i32 %tmp11, 9
+ br i1 %tmp12, label %bb14, label %bb4.bb13_crit_edge
+
+bb31.loopexit: ; preds = %bb14
+ br label %bb31
+
+bb31: ; preds = %bb31.loopexit, %bb13
+ store volatile i32 0, i32 addrspace(1)* undef
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll b/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
index 9dd99efd997c..97dc67f82607 100644
--- a/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
+++ b/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
@@ -9,7 +9,7 @@
@extern_const_addrspace = external unnamed_addr addrspace(2) constant [5 x i32], align 4
; CHECK-DAG: Name: load_extern_const_init
-define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @load_extern_const_init(i32 addrspace(1)* %out) nounwind {
%val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4
store i32 %val, i32 addrspace(1)* %out, align 4
ret void
@@ -19,7 +19,7 @@ define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind {
@undef_const_addrspace = unnamed_addr addrspace(2) constant [5 x i32] undef, align 4
; CHECK-DAG: Name: undef_const_addrspace
-define void @load_undef_const_init(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @load_undef_const_init(i32 addrspace(1)* %out) nounwind {
%val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4
store i32 %val, i32 addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/test/CodeGen/AMDGPU/no-shrink-extloads.ll
index fd66b0b5d1f6..8a7bf6db5b8d 100644
--- a/test/CodeGen/AMDGPU/no-shrink-extloads.ll
+++ b/test/CodeGen/AMDGPU/no-shrink-extloads.ll
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16:
; SI: s_load_dword s
; SI: buffer_store_short v
-define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind {
%trunc = trunc i32 %arg to i16
store i16 %trunc, i16 addrspace(1)* %out
ret void
@@ -21,7 +21,7 @@ define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounw
; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16:
; SI: buffer_load_dword v
; SI: buffer_store_short v
-define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
@@ -34,7 +34,7 @@ define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspa
; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8:
; SI: s_load_dword s
; SI: buffer_store_byte v
-define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind {
%trunc = trunc i32 %arg to i8
store i8 %trunc, i8 addrspace(1)* %out
ret void
@@ -43,7 +43,7 @@ define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwin
; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8:
; SI: buffer_load_dword v
; SI: buffer_store_byte v
-define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
@@ -56,7 +56,7 @@ define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace
; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1:
; SI: s_load_dword s
; SI: buffer_store_byte v
-define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind {
%trunc = trunc i32 %arg to i1
store i1 %trunc, i1 addrspace(1)* %out
ret void
@@ -65,7 +65,7 @@ define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwin
; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1:
; SI: buffer_load_dword v
; SI: buffer_store_byte v
-define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i1, i1 addrspace(1)* %out, i32 %tid
@@ -78,7 +78,7 @@ define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace
; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32:
; SI: s_load_dword s
; SI: buffer_store_dword v
-define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
%trunc = trunc i64 %arg to i32
store i32 %trunc, i32 addrspace(1)* %out
ret void
@@ -87,7 +87,7 @@ define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounw
; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32:
; SI: buffer_load_dword v
; SI: buffer_store_dword v
-define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -100,7 +100,7 @@ define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspa
; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32:
; SI: s_load_dword s
; SI: buffer_store_dword v
-define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
+define amdgpu_kernel void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
%srl = lshr i64 %arg, 32
%trunc = trunc i64 %srl to i32
store i32 %trunc, i32 addrspace(1)* %out
@@ -110,7 +110,7 @@ define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32:
; SI: buffer_load_dword v
; SI: buffer_store_dword v
-define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -125,7 +125,7 @@ define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)
; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8:
; SI: s_load_dword s
; SI: buffer_store_byte v
-define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind {
%trunc = trunc i16 %arg to i8
store i8 %trunc, i8 addrspace(1)* %out
ret void
@@ -134,7 +134,7 @@ define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwin
; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8:
; SI: buffer_load_ubyte v
; SI: buffer_store_byte v
-define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
@@ -147,7 +147,7 @@ define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace
; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8:
; SI: s_load_dword s
; SI: buffer_store_byte v
-define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
+define amdgpu_kernel void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
%srl = lshr i64 %arg, 32
%trunc = trunc i64 %srl to i8
store i8 %trunc, i8 addrspace(1)* %out
@@ -157,7 +157,7 @@ define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8:
; SI: buffer_load_dword v
; SI: buffer_store_byte v
-define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
@@ -171,7 +171,7 @@ define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)*
; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8:
; SI: s_load_dword s
; SI: buffer_store_byte v
-define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
%trunc = trunc i64 %arg to i8
store i8 %trunc, i8 addrspace(1)* %out
ret void
@@ -180,7 +180,7 @@ define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwin
; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8:
; SI: buffer_load_dword v
; SI: buffer_store_byte v
-define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
@@ -194,7 +194,7 @@ define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace
; SI: s_load_dword [[LOAD:s[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0x0
; SI: s_waitcnt lgkmcnt(0)
; SI: s_and_b32 s{{[0-9]+}}, [[LOAD]], 0xffff
-define void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%val = load i32, i32 addrspace(2)* %in
%mask = and i32 %val, 65535
@@ -205,7 +205,7 @@ entry:
; FUNC-LABEL: {{^}}extract_hi_i64_bitcast_v2i32:
; SI: buffer_load_dword v
; SI: buffer_store_dword v
-define void @extract_hi_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @extract_hi_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
%ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
%bc = bitcast <2 x i32> %ld to i64
%hi = lshr i64 %bc, 32
diff --git a/test/CodeGen/AMDGPU/nop-data.ll b/test/CodeGen/AMDGPU/nop-data.ll
new file mode 100644
index 000000000000..b68f343097e5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/nop-data.ll
@@ -0,0 +1,87 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -filetype=obj < %s | llvm-objdump -d - -mcpu=fiji | FileCheck %s
+
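+; kernel1 below is aligned to 256 bytes, so the gap after kernel0's
+; s_endpgm must be padded with s_nop (encoding 0xBF800000) rather than
+; left as zero bytes.
+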
+; CHECK: kernel0:
+; CHECK-NEXT: s_endpgm
+define amdgpu_kernel void @kernel0() align 256 {
+entry:
+ ret void
+}
+
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0 // 0000000001FC: BF800000
+
+; CHECK-NEXT: {{^$}}
+; CHECK-NEXT: kernel1:
+; CHECK-NEXT: s_endpgm
+define amdgpu_kernel void @kernel1(i32 addrspace(1)* addrspace(2)* %ptr.out) align 256 {
+entry:
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/nullptr.ll b/test/CodeGen/AMDGPU/nullptr.ll
new file mode 100644
index 000000000000..0df16da13562
--- /dev/null
+++ b/test/CodeGen/AMDGPU/nullptr.ll
@@ -0,0 +1,113 @@
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s
+
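+; Null is lowered per address space: an all-zero pattern for private,
+; global, and constant pointers, but -1 for local and region pointers,
+; presumably because the zero address is usable in those spaces.
+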
+%struct.S = type { i32*, i32 addrspace(1)*, i32 addrspace(2)*, i32 addrspace(3)*, i32 addrspace(4)*, i32 addrspace(5)*}
+
+; CHECK-LABEL: nullptr_priv:
+; CHECK-NEXT: .long 0
+@nullptr_priv = global i32* addrspacecast (i32 addrspace(4)* null to i32*)
+
+; CHECK-LABEL: nullptr_glob:
+; CHECK-NEXT: .quad 0
+@nullptr_glob = global i32 addrspace(1)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(1)*)
+
+; CHECK-LABEL: nullptr_const:
+; CHECK-NEXT: .quad 0
+@nullptr_const = global i32 addrspace(2)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(2)*)
+
+; CHECK-LABEL: nullptr_local:
+; CHECK-NEXT: .long -1
+@nullptr_local = global i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*)
+
+; CHECK-LABEL: nullptr_region:
+; CHECK-NEXT: .long -1
+@nullptr_region = global i32 addrspace(5)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(5)*)
+
+; CHECK-LABEL: nullptr6:
+; CHECK-NEXT: .long 0
+@nullptr6 = global i32 addrspace(6)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(6)*)
+
+; CHECK-LABEL: nullptr7:
+; CHECK-NEXT: .long 0
+@nullptr7 = global i32 addrspace(7)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(7)*)
+
+; CHECK-LABEL: nullptr8:
+; CHECK-NEXT: .long 0
+@nullptr8 = global i32 addrspace(8)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(8)*)
+
+; CHECK-LABEL: nullptr9:
+; CHECK-NEXT: .long 0
+@nullptr9 = global i32 addrspace(9)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(9)*)
+
+; CHECK-LABEL: nullptr10:
+; CHECK-NEXT: .long 0
+@nullptr10 = global i32 addrspace(10)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(10)*)
+
+; CHECK-LABEL: nullptr11:
+; CHECK-NEXT: .long 0
+@nullptr11 = global i32 addrspace(11)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(11)*)
+
+; CHECK-LABEL: nullptr12:
+; CHECK-NEXT: .long 0
+@nullptr12 = global i32 addrspace(12)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(12)*)
+
+; CHECK-LABEL: nullptr13:
+; CHECK-NEXT: .long 0
+@nullptr13 = global i32 addrspace(13)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(13)*)
+
+; CHECK-LABEL: nullptr14:
+; CHECK-NEXT: .long 0
+@nullptr14 = global i32 addrspace(14)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(14)*)
+
+; CHECK-LABEL: nullptr15:
+; CHECK-NEXT: .long 0
+@nullptr15 = global i32 addrspace(15)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(15)*)
+
+; CHECK-LABEL: nullptr16:
+; CHECK-NEXT: .long 0
+@nullptr16 = global i32 addrspace(16)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(16)*)
+
+; CHECK-LABEL: nullptr17:
+; CHECK-NEXT: .long 0
+@nullptr17 = global i32 addrspace(17)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(17)*)
+
+; CHECK-LABEL: nullptr18:
+; CHECK-NEXT: .long 0
+@nullptr18 = global i32 addrspace(18)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(18)*)
+
+; CHECK-LABEL: nullptr19:
+; CHECK-NEXT: .long 0
+@nullptr19 = global i32 addrspace(19)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(19)*)
+
+; CHECK-LABEL: nullptr20:
+; CHECK-NEXT: .long 0
+@nullptr20 = global i32 addrspace(20)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(20)*)
+
+; CHECK-LABEL: nullptr21:
+; CHECK-NEXT: .long 0
+@nullptr21 = global i32 addrspace(21)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(21)*)
+
+; CHECK-LABEL: nullptr22:
+; CHECK-NEXT: .long 0
+@nullptr22 = global i32 addrspace(22)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(22)*)
+
+; CHECK-LABEL: nullptr23:
+; CHECK-NEXT: .long 0
+@nullptr23 = global i32 addrspace(23)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(23)*)
+
+; CHECK-LABEL: structWithPointers:
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .zero 4
+; CHECK-NEXT: .quad 0
+; CHECK-NEXT: .quad 0
+; CHECK-NEXT: .long -1
+; CHECK-NEXT: .zero 4
+; CHECK-NEXT: .quad 0
+; CHECK-NEXT: .long -1
+; CHECK-NEXT: .zero 4
+@structWithPointers = addrspace(1) global %struct.S {
+ i32* addrspacecast (i32 addrspace(4)* null to i32*),
+ i32 addrspace(1)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(1)*),
+ i32 addrspace(2)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(2)*),
+ i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*),
+ i32 addrspace(4)* null,
+ i32 addrspace(5)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(5)*)}, align 4
diff --git a/test/CodeGen/AMDGPU/omod.ll b/test/CodeGen/AMDGPU/omod.ll
new file mode 100644
index 000000000000..3fd7b13fcc58
--- /dev/null
+++ b/test/CodeGen/AMDGPU/omod.ll
@@ -0,0 +1,297 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
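+; omod is the VALU output modifier (mul:2, mul:4, div:2) that scales an
+; instruction's result by a power of two. These tests check when a
+; separate multiply by 2.0, 4.0, or 0.5 may be folded into the modifier
+; of the defining instruction, and when IEEE mode, signed zeros,
+; denormals, clamp, or extra uses block the fold.
+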
+; IEEE bit enabled for compute kernel, so shouldn't use.
+; GCN-LABEL: {{^}}v_omod_div2_f32_enable_ieee_signed_zeros:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
+define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %add = fadd float %a, 1.0
+ %div2 = fmul float %add, 0.5
+ store float %div2, float addrspace(1)* %out.gep
+ ret void
+}
+
+; IEEE bit enabled for compute kernel, so shouldn't use even though nsz is allowed.
+; GCN-LABEL: {{^}}v_omod_div2_f32_enable_ieee_nsz:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
+define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %add = fadd float %a, 1.0
+ %div2 = fmul float %add, 0.5
+ store float %div2, float addrspace(1)* %out.gep
+ ret void
+}
+
+; Only allow without IEEE bit if signed zeros are insignificant.
+; GCN-LABEL: {{^}}v_omod_div2_f32_signed_zeros:
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
+define amdgpu_ps void @v_omod_div2_f32_signed_zeros(float %a) #4 {
+ %add = fadd float %a, 1.0
+ %div2 = fmul float %add, 0.5
+ store float %div2, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_div2_f32:
+; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 div:2{{$}}
+define amdgpu_ps void @v_omod_div2_f32(float %a) #0 {
+ %add = fadd float %a, 1.0
+ %div2 = fmul float %add, 0.5
+ store float %div2, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_mul2_f32:
+; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:2{{$}}
+define amdgpu_ps void @v_omod_mul2_f32(float %a) #0 {
+ %add = fadd float %a, 1.0
+ %div2 = fmul float %add, 2.0
+ store float %div2, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_mul4_f32:
+; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:4{{$}}
+define amdgpu_ps void @v_omod_mul4_f32(float %a) #0 {
+ %add = fadd float %a, 1.0
+ %div2 = fmul float %add, 4.0
+ store float %div2, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_mul4_multi_use_f32:
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 4.0, [[ADD]]{{$}}
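+; (The separate volatile store of %add keeps the unmodified result
+; live, so the multiply cannot be absorbed into the omod field.)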
+define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 {
+ %add = fadd float %a, 1.0
+ %div2 = fmul float %add, 4.0
+ store float %div2, float addrspace(1)* undef
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_mul4_dbg_use_f32:
+; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:4{{$}}
+define amdgpu_ps void @v_omod_mul4_dbg_use_f32(float %a) #0 {
+ %add = fadd float %a, 1.0
+ call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10
+ %div2 = fmul float %add, 4.0
+ store float %div2, float addrspace(1)* undef
+ ret void
+}
+
+; Clamp is applied after omod, so folding both into the instruction is OK.
+; GCN-LABEL: {{^}}v_clamp_omod_div2_f32:
+; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 clamp div:2{{$}}
+define amdgpu_ps void @v_clamp_omod_div2_f32(float %a) #0 {
+ %add = fadd float %a, 1.0
+ %div2 = fmul float %add, 0.5
+
+ %max = call float @llvm.maxnum.f32(float %div2, float 0.0)
+ %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+ store float %clamp, float addrspace(1)* undef
+ ret void
+}
+
+; Cannot fold omod into clamp
+; GCN-LABEL: {{^}}v_omod_div2_clamp_f32:
+; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], v0, 1.0 clamp{{$}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
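+; (Since clamp is applied after omod in hardware, clamp(x) * 0.5 would
+; need the multiply to happen before the clamp, which the modifiers
+; cannot express.)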
+define amdgpu_ps void @v_omod_div2_clamp_f32(float %a) #0 {
+ %add = fadd float %a, 1.0
+ %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+ %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+ %div2 = fmul float %clamp, 0.5
+ store float %div2, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_div2_abs_src_f32:
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ADD]]|, 0.5{{$}}
+define amdgpu_ps void @v_omod_div2_abs_src_f32(float %a) #0 {
+ %add = fadd float %a, 1.0
+ %abs.add = call float @llvm.fabs.f32(float %add)
+ %div2 = fmul float %abs.add, 0.5
+ store float %div2, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_add_self_clamp_f32:
+; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, v0 clamp{{$}}
+define amdgpu_ps void @v_omod_add_self_clamp_f32(float %a) #0 {
+ %add = fadd float %a, %a
+ %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+ %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+ store float %clamp, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_add_clamp_self_f32:
+; GCN: v_max_f32_e64 [[CLAMP:v[0-9]+]], v0, v0 clamp{{$}}
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[CLAMP]], [[CLAMP]]{{$}}
+define amdgpu_ps void @v_omod_add_clamp_self_f32(float %a) #0 {
+ %max = call float @llvm.maxnum.f32(float %a, float 0.0)
+ %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+ %add = fadd float %clamp, %clamp
+ store float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_add_abs_self_f32:
+; GCN: v_add_f32_e32 [[X:v[0-9]+]], 1.0, v0
+; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, |[[X]]|{{$}}
+define amdgpu_ps void @v_omod_add_abs_self_f32(float %a) #0 {
+ %x = fadd float %a, 1.0
+ %abs.x = call float @llvm.fabs.f32(float %x)
+ %add = fadd float %abs.x, %abs.x
+ store float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_add_abs_x_x_f32:
+
+; GCN: v_add_f32_e32 [[X:v[0-9]+]], 1.0, v0
+; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, [[X]]{{$}}
+define amdgpu_ps void @v_omod_add_abs_x_x_f32(float %a) #0 {
+ %x = fadd float %a, 1.0
+ %abs.x = call float @llvm.fabs.f32(float %x)
+ %add = fadd float %abs.x, %x
+ store float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_add_x_abs_x_f32:
+; GCN: v_add_f32_e32 [[X:v[0-9]+]], 1.0, v0
+; GCN: v_add_f32_e64 v{{[0-9]+}}, [[X]], |[[X]]|{{$}}
+define amdgpu_ps void @v_omod_add_x_abs_x_f32(float %a) #0 {
+ %x = fadd float %a, 1.0
+ %abs.x = call float @llvm.fabs.f32(float %x)
+ %add = fadd float %x, %abs.x
+ store float %add, float addrspace(1)* undef
+ ret void
+}
+
+; Don't fold one omod into another omod.
+; GCN-LABEL: {{^}}v_omod_div2_omod_div2_f32:
+; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], v0, 1.0 div:2{{$}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
+define amdgpu_ps void @v_omod_div2_omod_div2_f32(float %a) #0 {
+ %add = fadd float %a, 1.0
+ %div2.0 = fmul float %add, 0.5
+ %div2.1 = fmul float %div2.0, 0.5
+ store float %div2.1, float addrspace(1)* undef
+ ret void
+}
+
+; Don't fold omod if denorms enabled
+; GCN-LABEL: {{^}}v_omod_div2_f32_denormals:
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
+define amdgpu_ps void @v_omod_div2_f32_denormals(float %a) #2 {
+ %add = fadd float %a, 1.0
+ %div2 = fmul float %add, 0.5
+ store float %div2, float addrspace(1)* undef
+ ret void
+}
+
+; Don't fold omod if denorms enabled for add form.
+; GCN-LABEL: {{^}}v_omod_mul2_f32_denormals:
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[ADD]], [[ADD]]{{$}}
+define amdgpu_ps void @v_omod_mul2_f32_denormals(float %a) #2 {
+ %add = fadd float %a, 1.0
+ %mul2 = fadd float %add, %add
+ store float %mul2, float addrspace(1)* undef
+ ret void
+}
+
+; Don't fold omod if denorms enabled
+; GCN-LABEL: {{^}}v_omod_div2_f16_denormals:
+; VI: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
+; VI: v_mul_f16_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
+define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 {
+ %add = fadd half %a, 1.0
+ %div2 = fmul half %add, 0.5
+ store half %div2, half addrspace(1)* undef
+ ret void
+}
+
+; Don't fold omod if denorms enabled for add form.
+; GCN-LABEL: {{^}}v_omod_mul2_f16_denormals:
+; VI: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
+; VI: v_add_f16_e32 v{{[0-9]+}}, [[ADD]], [[ADD]]{{$}}
+define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 {
+ %add = fadd half %a, 1.0
+ %mul2 = fadd half %add, %add
+ store half %mul2, half addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_div2_f16_no_denormals:
+; VI-NOT: v0
+; VI: v_add_f16_e64 [[ADD:v[0-9]+]], v0, 1.0 div:2{{$}}
+define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 {
+ %add = fadd half %a, 1.0
+ %div2 = fmul half %add, 0.5
+ store half %div2, half addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_mac_to_mad:
+; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} mul:2{{$}}
+define amdgpu_ps void @v_omod_mac_to_mad(float %b, float %a) #0 {
+ %mul = fmul float %a, %a
+ %add = fadd float %mul, %b
+ %mad = fmul float %add, 2.0
+ %res = fmul float %mad, %b
+ store float %res, float addrspace(1)* undef
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.floor.f32(float) #1
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
+declare double @llvm.fabs.f64(double) #1
+declare double @llvm.minnum.f64(double, double) #1
+declare double @llvm.maxnum.f64(double, double) #1
+declare half @llvm.fabs.f16(half) #1
+declare half @llvm.minnum.f16(half, half) #1
+declare half @llvm.maxnum.f16(half, half) #1
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind "no-signed-zeros-fp-math"="true" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "target-features"="+fp32-denormals" "no-signed-zeros-fp-math"="true" }
+attributes #3 = { nounwind "target-features"="-fp64-fp16-denormals" "no-signed-zeros-fp-math"="true" }
+attributes #4 = { nounwind "no-signed-zeros-fp-math"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
+!1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1)
+!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!6 = !DISubroutineType(types: !7)
+!7 = !{null, !8}
+!8 = !DIBasicType(name: "float", size: 32, align: 32)
+!9 = !DIExpression()
+!10 = !DILocation(line: 1, column: 42, scope: !5)
diff --git a/test/CodeGen/AMDGPU/opencl-image-metadata.ll b/test/CodeGen/AMDGPU/opencl-image-metadata.ll
index 0242f6d6145a..c974471c6573 100644
--- a/test/CodeGen/AMDGPU/opencl-image-metadata.ll
+++ b/test/CodeGen/AMDGPU/opencl-image-metadata.ll
@@ -6,7 +6,7 @@
; EG: CF_END
; SI: s_endpgm
-define void @kernel(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @kernel(i32 addrspace(1)* %out) {
entry:
store i32 0, i32 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/operand-folding.ll b/test/CodeGen/AMDGPU/operand-folding.ll
index 4e5ea4b86b77..3836a2b7e599 100644
--- a/test/CodeGen/AMDGPU/operand-folding.ll
+++ b/test/CodeGen/AMDGPU/operand-folding.ll
@@ -2,7 +2,7 @@
; CHECK-LABEL: {{^}}fold_sgpr:
; CHECK: v_add_i32_e32 v{{[0-9]+}}, vcc, s
-define void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) {
+define amdgpu_kernel void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) {
entry:
%tmp0 = icmp ne i32 %fold, 0
br i1 %tmp0, label %if, label %endif
@@ -20,7 +20,7 @@ endif:
; CHECK-LABEL: {{^}}fold_imm:
; CHECK: v_or_b32_e32 v{{[0-9]+}}, 5
-define void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) {
+define amdgpu_kernel void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) {
entry:
%fold = add i32 3, 2
%tmp0 = icmp ne i32 %cmp, 0
@@ -46,7 +46,7 @@ endif:
; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]]
; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}},
-define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) {
+define amdgpu_kernel void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) {
entry:
%tmp0 = add i64 %val, 1
store i64 %tmp0, i64 addrspace(1)* %out
@@ -61,7 +61,7 @@ entry:
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
-define void @vector_inline(<4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @vector_inline(<4 x i32> addrspace(1)* %out) {
entry:
%tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = add i32 %tmp0, 1
@@ -80,7 +80,7 @@ entry:
; CHECK-LABEL: {{^}}imm_one_use:
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0x64, v{{[0-9]+}}
-define void @imm_one_use(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @imm_one_use(i32 addrspace(1)* %out) {
entry:
%tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = xor i32 %tmp0, 100
@@ -94,7 +94,7 @@ entry:
; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
-define void @vector_imm(<4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @vector_imm(<4 x i32> addrspace(1)* %out) {
entry:
%tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = add i32 %tmp0, 1
@@ -114,7 +114,7 @@ entry:
; CHECK: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; CHECK: v_mac_f32_e32 v[[LO]], 0x41200000, v[[HI]]
; CHECK: buffer_store_dword v[[LO]]
-define void @no_fold_tied_subregister() {
+define amdgpu_kernel void @no_fold_tied_subregister() {
%tmp1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef
%tmp2 = extractelement <2 x float> %tmp1, i32 0
%tmp3 = extractelement <2 x float> %tmp1, i32 1
diff --git a/test/CodeGen/AMDGPU/operand-spacing.ll b/test/CodeGen/AMDGPU/operand-spacing.ll
index 127f3da220e7..fc6f070b737a 100644
--- a/test/CodeGen/AMDGPU/operand-spacing.ll
+++ b/test/CodeGen/AMDGPU/operand-spacing.ll
@@ -11,7 +11,7 @@
; GCN: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]]
; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]]
; GCN: buffer_store_dword [[RESULT]],
-define void @add_f32(float addrspace(1)* %out, float %a, float %b) {
+define amdgpu_kernel void @add_f32(float addrspace(1)* %out, float %a, float %b) {
%result = fadd float %a, %b
store float %result, float addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir b/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir
index 4584802ad5a7..2de6b59e59e9 100644
--- a/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir
+++ b/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir
@@ -3,7 +3,7 @@
--- |
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
- define void @optimize_if_and_saveexec_xor(i32 %z, i32 %v) #0 {
+ define amdgpu_kernel void @optimize_if_and_saveexec_xor(i32 %z, i32 %v) #0 {
main_body:
%id = call i32 @llvm.amdgcn.workitem.id.x()
%cc = icmp eq i32 %id, 0
@@ -23,7 +23,7 @@
ret void
}
- define void @optimize_if_and_saveexec(i32 %z, i32 %v) #0 {
+ define amdgpu_kernel void @optimize_if_and_saveexec(i32 %z, i32 %v) #0 {
main_body:
br i1 undef, label %if, label %end
@@ -34,7 +34,7 @@
ret void
}
- define void @optimize_if_or_saveexec(i32 %z, i32 %v) #0 {
+ define amdgpu_kernel void @optimize_if_or_saveexec(i32 %z, i32 %v) #0 {
main_body:
br i1 undef, label %if, label %end
@@ -46,7 +46,7 @@
}
- define void @optimize_if_and_saveexec_xor_valu_middle(i32 %z, i32 %v) #0 {
+ define amdgpu_kernel void @optimize_if_and_saveexec_xor_valu_middle(i32 %z, i32 %v) #0 {
main_body:
%id = call i32 @llvm.amdgcn.workitem.id.x()
%cc = icmp eq i32 %id, 0
@@ -67,7 +67,7 @@
ret void
}
- define void @optimize_if_and_saveexec_xor_wrong_reg(i32 %z, i32 %v) #0 {
+ define amdgpu_kernel void @optimize_if_and_saveexec_xor_wrong_reg(i32 %z, i32 %v) #0 {
main_body:
br i1 undef, label %if, label %end
@@ -78,7 +78,7 @@
ret void
}
- define void @optimize_if_and_saveexec_xor_modify_copy_to_exec(i32 %z, i32 %v) #0 {
+ define amdgpu_kernel void @optimize_if_and_saveexec_xor_modify_copy_to_exec(i32 %z, i32 %v) #0 {
main_body:
br i1 undef, label %if, label %end
@@ -89,7 +89,7 @@
ret void
}
- define void @optimize_if_and_saveexec_xor_live_out_setexec(i32 %z, i32 %v) #0 {
+ define amdgpu_kernel void @optimize_if_and_saveexec_xor_live_out_setexec(i32 %z, i32 %v) #0 {
main_body:
br i1 undef, label %if, label %end
@@ -100,7 +100,7 @@
ret void
}
- define void @optimize_if_unknown_saveexec(i32 %z, i32 %v) #0 {
+ define amdgpu_kernel void @optimize_if_unknown_saveexec(i32 %z, i32 %v) #0 {
main_body:
br i1 undef, label %if, label %end
@@ -111,7 +111,7 @@
ret void
}
- define void @optimize_if_andn2_saveexec(i32 %z, i32 %v) #0 {
+ define amdgpu_kernel void @optimize_if_andn2_saveexec(i32 %z, i32 %v) #0 {
main_body:
br i1 undef, label %if, label %end
@@ -122,7 +122,7 @@
ret void
}
- define void @optimize_if_andn2_saveexec_no_commute(i32 %z, i32 %v) #0 {
+ define amdgpu_kernel void @optimize_if_andn2_saveexec_no_commute(i32 %z, i32 %v) #0 {
main_body:
br i1 undef, label %if, label %end
diff --git a/test/CodeGen/AMDGPU/or.ll b/test/CodeGen/AMDGPU/or.ll
index eca6909d4eb9..eb082843fb82 100644
--- a/test/CodeGen/AMDGPU/or.ll
+++ b/test/CodeGen/AMDGPU/or.ll
@@ -9,7 +9,7 @@
; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
%a = load <2 x i32>, <2 x i32> addrspace(1) * %in
%b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
@@ -28,7 +28,7 @@ define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in)
; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32>, <4 x i32> addrspace(1) * %in
%b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
@@ -39,7 +39,7 @@ define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in)
; FUNC-LABEL: {{^}}scalar_or_i32:
; SI: s_or_b32
-define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
%or = or i32 %a, %b
store i32 %or, i32 addrspace(1)* %out
ret void
@@ -47,7 +47,7 @@ define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
; FUNC-LABEL: {{^}}vector_or_i32:
; SI: v_or_b32_e32 v{{[0-9]}}
-define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) {
+define amdgpu_kernel void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) {
%loada = load i32, i32 addrspace(1)* %a
%or = or i32 %loada, %b
store i32 %or, i32 addrspace(1)* %out
@@ -56,7 +56,7 @@ define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b)
; FUNC-LABEL: {{^}}scalar_or_literal_i32:
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1869f
-define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) {
%or = or i32 %a, 99999
store i32 %or, i32 addrspace(1)* %out, align 4
ret void
@@ -68,7 +68,7 @@ define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) {
; SI-DAG: s_or_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039
; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]]
; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]]
-define void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) {
%or = or i64 %a, 4261135838621753
store i64 %or, i64 addrspace(1)* %out
ret void
@@ -82,7 +82,7 @@ define void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) {
; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]]
; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]]
-define void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
%or = or i64 %a, 4261135838621753
store i64 %or, i64 addrspace(1)* %out
@@ -101,7 +101,7 @@ define void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64
; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]]
; SI-NOT: or_b32
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
%or = or i64 %a, 63
store i64 %or, i64 addrspace(1)* %out
ret void
@@ -111,7 +111,7 @@ define void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
; SI-NOT: or_b32
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 63
; SI-NOT: or_b32
-define void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
%or = or i64 %a, 63
store i64 %or, i64 addrspace(1)* %out
%foo = add i64 %b, 63
@@ -125,7 +125,7 @@ define void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* %out, i64 %a,
; SI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], -1{{$}}
; SI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[VAL]]
; SI: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
%or = or i64 %a, -8
store i64 %or, i64 addrspace(1)* %out
ret void
@@ -133,7 +133,7 @@ define void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
; FUNC-LABEL: {{^}}vector_or_literal_i32:
; SI: v_or_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
-define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
%loada = load i32, i32 addrspace(1)* %a, align 4
%or = or i32 %loada, 65535
store i32 %or, i32 addrspace(1)* %out, align 4
@@ -142,7 +142,7 @@ define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a,
; FUNC-LABEL: {{^}}vector_or_inline_immediate_i32:
; SI: v_or_b32_e32 v{{[0-9]+}}, 4, v{{[0-9]+}}
-define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
%loada = load i32, i32 addrspace(1)* %a, align 4
%or = or i32 %loada, 4
store i32 %or, i32 addrspace(1)* %out, align 4
@@ -154,7 +154,7 @@ define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspac
; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
; SI: s_or_b64
-define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
%or = or i64 %a, %b
store i64 %or, i64 addrspace(1)* %out
ret void
@@ -163,7 +163,7 @@ define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
; FUNC-LABEL: {{^}}vector_or_i64:
; SI: v_or_b32_e32 v{{[0-9]}}
; SI: v_or_b32_e32 v{{[0-9]}}
-define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 8
%loadb = load i64, i64 addrspace(1)* %b, align 8
%or = or i64 %loada, %loadb
@@ -174,7 +174,7 @@ define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
; FUNC-LABEL: {{^}}scalar_vector_or_i64:
; SI: v_or_b32_e32 v{{[0-9]}}
; SI: v_or_b32_e32 v{{[0-9]}}
-define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) {
+define amdgpu_kernel void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) {
%loada = load i64, i64 addrspace(1)* %a
%or = or i64 %loada, %b
store i64 %or, i64 addrspace(1)* %out
@@ -186,7 +186,7 @@ define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a,
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]]
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]]
; SI: s_endpgm
-define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 8
%or = or i64 %loada, 22470723082367
store i64 %or, i64 addrspace(1)* %out
@@ -200,7 +200,7 @@ define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a,
; SI-NOT: v_or_b32_e32 {{v[0-9]+}}, 0
; SI: buffer_store_dwordx2 v{{\[}}[[LO_RESULT]]:[[HI_VREG]]{{\]}}
; SI: s_endpgm
-define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 8
%or = or i64 %loada, 8
store i64 %or, i64 addrspace(1)* %out
@@ -213,7 +213,7 @@ define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64
; SI-DAG: v_mov_b32_e32 v[[RES_HI:[0-9]+]], -1{{$}}
; SI: buffer_store_dwordx2 v{{\[}}[[RES_LO]]:[[RES_HI]]{{\]}}
; SI: s_endpgm
-define void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 8
%or = or i64 %loada, -8
store i64 %or, i64 addrspace(1)* %out
@@ -226,7 +226,7 @@ define void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffffff38, v[[LO_VREG]]
; SI: buffer_store_dwordx2
; SI: s_endpgm
-define void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 8
%or = or i64 %loada, -200
store i64 %or, i64 addrspace(1)* %out
@@ -239,7 +239,7 @@ define void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64 addrspace(1)*
; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]]
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]]
; SI: buffer_store_dword [[VRESULT]],
-define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
%add = or i64 %b, %a
%trunc = trunc i64 %add to i32
store i32 %trunc, i32 addrspace(1)* %out, align 8
@@ -250,7 +250,7 @@ define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
; EG: OR_INT * {{\** *}}T{{[0-9]+\.[XYZW], PS, PV\.[XYZW]}}
; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}]
-define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
+define amdgpu_kernel void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
%a = load float, float addrspace(1)* %in0
%b = load float, float addrspace(1)* %in1
%acmp = fcmp oge float %a, 0.000000e+00
@@ -263,7 +263,7 @@ define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrs
; FUNC-LABEL: {{^}}s_or_i1:
; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}]
-define void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_kernel void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
%cmp0 = icmp eq i32 %a, %b
%cmp1 = icmp eq i32 %c, %d
%or = or i1 %cmp0, %cmp1
diff --git a/test/CodeGen/AMDGPU/over-max-lds-size.ll b/test/CodeGen/AMDGPU/over-max-lds-size.ll
index 32ad9aba04ed..57777e783c56 100644
--- a/test/CodeGen/AMDGPU/over-max-lds-size.ll
+++ b/test/CodeGen/AMDGPU/over-max-lds-size.ll
@@ -6,7 +6,7 @@
@huge = internal unnamed_addr addrspace(3) global [100000 x i32] undef, align 4
-define void @use_huge_lds() {
+define amdgpu_kernel void @use_huge_lds() {
entry:
%v0 = getelementptr inbounds [100000 x i32], [100000 x i32] addrspace(3)* @huge, i32 0, i32 0
store i32 0, i32 addrspace(3)* %v0
diff --git a/test/CodeGen/AMDGPU/pack.v2f16.ll b/test/CodeGen/AMDGPU/pack.v2f16.ll
new file mode 100644
index 000000000000..5a07f7ca6ae8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/pack.v2f16.ll
@@ -0,0 +1,219 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+
+
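+; A rough sketch of the lowering these checks expect (inferred here from
+; the check lines, not taken from compiler output): on GFX9 a <2 x half>
+; built from two scalars packs with a single s_pack_ll_b32_b16, i.e.
+;   packed = ((hi & 0xffff) << 16) | (lo & 0xffff)
+; For VGPR inputs the same value is produced by masking the low element
+; with v_and_b32 and then combining with v_lshl_or_b32.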
+; GCN-LABEL: {{^}}s_pack_v2f16:
+; GFX9: s_load_dword [[VAL0:s[0-9]+]]
+; GFX9: s_load_dword [[VAL1:s[0-9]+]]
+; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]]
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @s_pack_v2f16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 {
+ %val0 = load volatile i32, i32 addrspace(2)* %in0
+ %val1 = load volatile i32, i32 addrspace(2)* %in1
+ %lo.i = trunc i32 %val0 to i16
+ %hi.i = trunc i32 %val1 to i16
+ %lo = bitcast i16 %lo.i to half
+ %hi = bitcast i16 %hi.i to half
+ %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
+ %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
+ %vec.i32 = bitcast <2 x half> %vec.1 to i32
+
+ call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_pack_v2f16_imm_lo:
+; GFX9: s_load_dword [[VAL1:s[0-9]+]]
+; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], 0x1234, [[VAL1]]
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @s_pack_v2f16_imm_lo(i32 addrspace(2)* %in1) #0 {
+ %val1 = load i32, i32 addrspace(2)* %in1
+ %hi.i = trunc i32 %val1 to i16
+ %hi = bitcast i16 %hi.i to half
+ %vec.0 = insertelement <2 x half> undef, half 0xH1234, i32 0
+ %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
+ %vec.i32 = bitcast <2 x half> %vec.1 to i32
+
+ call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_pack_v2f16_imm_hi:
+; GFX9: s_load_dword [[VAL0:s[0-9]+]]
+; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], 0x1234
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @s_pack_v2f16_imm_hi(i32 addrspace(2)* %in0) #0 {
+ %val0 = load i32, i32 addrspace(2)* %in0
+ %lo.i = trunc i32 %val0 to i16
+ %lo = bitcast i16 %lo.i to half
+ %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
+ %vec.1 = insertelement <2 x half> %vec.0, half 0xH1234, i32 1
+ %vec.i32 = bitcast <2 x half> %vec.1 to i32
+
+ call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2f16:
+; GFX9: flat_load_dword [[VAL0:v[0-9]+]]
+; GFX9: flat_load_dword [[VAL1:v[0-9]+]]
+
+; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VAL0]]
+; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]]
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2f16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
+ %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
+ %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
+ %val1 = load volatile i32, i32 addrspace(1)* %in1.gep
+ %lo.i = trunc i32 %val0 to i16
+ %hi.i = trunc i32 %val1 to i16
+ %lo = bitcast i16 %lo.i to half
+ %hi = bitcast i16 %hi.i to half
+ %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
+ %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
+ %vec.i32 = bitcast <2 x half> %vec.1 to i32
+ call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2f16_user:
+; GFX9: flat_load_dword [[VAL0:v[0-9]+]]
+; GFX9: flat_load_dword [[VAL1:v[0-9]+]]
+
+; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VAL0]]
+; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]]
+
+; GFX9: v_add_i32_e32 v{{[0-9]+}}, vcc, 9, [[PACKED]]
+define amdgpu_kernel void @v_pack_v2f16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
+ %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
+ %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
+ %val1 = load volatile i32, i32 addrspace(1)* %in1.gep
+ %lo.i = trunc i32 %val0 to i16
+ %hi.i = trunc i32 %val1 to i16
+ %lo = bitcast i16 %lo.i to half
+ %hi = bitcast i16 %hi.i to half
+ %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
+ %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
+ %vec.i32 = bitcast <2 x half> %vec.1 to i32
+ %foo = add i32 %vec.i32, 9
+ store volatile i32 %foo, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2f16_imm_lo:
+; GFX9-DAG: flat_load_dword [[VAL1:v[0-9]+]]
+
+; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234{{$}}
+; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]]
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2f16_imm_lo(i32 addrspace(1)* %in1) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
+ %val1 = load volatile i32, i32 addrspace(1)* %in1.gep
+ %hi.i = trunc i32 %val1 to i16
+ %hi = bitcast i16 %hi.i to half
+ %vec.0 = insertelement <2 x half> undef, half 0xH1234, i32 0
+ %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
+ %vec.i32 = bitcast <2 x half> %vec.1 to i32
+ call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2f16_inline_imm_lo:
+; GFX9-DAG: flat_load_dword [[VAL1:v[0-9]+]]
+
+; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4400{{$}}
+; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]]
+
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(i32 addrspace(1)* %in1) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
+ %val1 = load volatile i32, i32 addrspace(1)* %in1.gep
+ %hi.i = trunc i32 %val1 to i16
+ %hi = bitcast i16 %hi.i to half
+ %vec.0 = insertelement <2 x half> undef, half 4.0, i32 0
+ %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
+ %vec.i32 = bitcast <2 x half> %vec.1 to i32
+ call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2f16_imm_hi:
+; GFX9-DAG: flat_load_dword [[VAL0:v[0-9]+]]
+
+; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x1234
+; GFX9: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL0]]
+; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[MASKED]]
+
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2f16_imm_hi(i32 addrspace(1)* %in0) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
+ %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
+ %lo.i = trunc i32 %val0 to i16
+ %lo = bitcast i16 %lo.i to half
+ %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
+ %vec.1 = insertelement <2 x half> %vec.0, half 0xH1234, i32 1
+ %vec.i32 = bitcast <2 x half> %vec.1 to i32
+ call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2f16_inline_f16imm_hi:
+; GFX9-DAG: flat_load_dword [[VAL:v[0-9]+]]
+
+; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3c00
+; GFX9: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL]]
+; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[MASKED]]
+
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(i32 addrspace(1)* %in0) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
+ %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
+ %lo.i = trunc i32 %val0 to i16
+ %lo = bitcast i16 %lo.i to half
+ %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
+ %vec.1 = insertelement <2 x half> %vec.0, half 1.0, i32 1
+ %vec.i32 = bitcast <2 x half> %vec.1 to i32
+ call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2f16_inline_imm_hi:
+; GFX9: flat_load_dword [[VAL:v[0-9]+]]
+
+; GFX9: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL]]
+; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], 64, 16, [[MASKED]]
+
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(i32 addrspace(1)* %in0) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
+ %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
+ %lo.i = trunc i32 %val0 to i16
+ %lo = bitcast i16 %lo.i to half
+ %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
+ %vec.1 = insertelement <2 x half> %vec.0, half 0xH0040, i32 1
+ %vec.i32 = bitcast <2 x half> %vec.1 to i32
+ call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/pack.v2i16.ll b/test/CodeGen/AMDGPU/pack.v2i16.ll
new file mode 100644
index 000000000000..8515fbc6dbae
--- /dev/null
+++ b/test/CodeGen/AMDGPU/pack.v2i16.ll
@@ -0,0 +1,181 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+
+
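+; As in pack.v2f16.ll, the GFX9 pattern expected below combines the two
+; i16 elements as
+;   packed = ((hi16 & 0xffff) << 16) | (lo16 & 0xffff)
+; using s_pack_ll_b32_b16 for scalar inputs; for VGPR inputs, inline
+; constants (e.g. 64 or 7) are expected to fold directly into
+; v_lshl_or_b32, while wider literals like 0x7b still need a move first.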
+; GCN-LABEL: {{^}}s_pack_v2i16:
+; GFX9: s_load_dword [[VAL0:s[0-9]+]]
+; GFX9: s_load_dword [[VAL1:s[0-9]+]]
+; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]]
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @s_pack_v2i16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 {
+ %val0 = load volatile i32, i32 addrspace(2)* %in0
+ %val1 = load volatile i32, i32 addrspace(2)* %in1
+ %lo = trunc i32 %val0 to i16
+ %hi = trunc i32 %val1 to i16
+ %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
+ %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
+ %vec.i32 = bitcast <2 x i16> %vec.1 to i32
+
+ call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_pack_v2i16_imm_lo:
+; GFX9: s_load_dword [[VAL1:s[0-9]+]]
+; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], 0x1c8, [[VAL1]]
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @s_pack_v2i16_imm_lo(i32 addrspace(2)* %in1) #0 {
+ %val1 = load i32, i32 addrspace(2)* %in1
+ %hi = trunc i32 %val1 to i16
+ %vec.0 = insertelement <2 x i16> undef, i16 456, i32 0
+ %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
+ %vec.i32 = bitcast <2 x i16> %vec.1 to i32
+
+ call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_pack_v2i16_imm_hi:
+; GFX9: s_load_dword [[VAL0:s[0-9]+]]
+; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], 0x1c8
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @s_pack_v2i16_imm_hi(i32 addrspace(2)* %in0) #0 {
+ %val0 = load i32, i32 addrspace(2)* %in0
+ %lo = trunc i32 %val0 to i16
+ %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
+ %vec.1 = insertelement <2 x i16> %vec.0, i16 456, i32 1
+ %vec.i32 = bitcast <2 x i16> %vec.1 to i32
+
+ call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2i16:
+; GFX9: flat_load_dword [[VAL0:v[0-9]+]]
+; GFX9: flat_load_dword [[VAL1:v[0-9]+]]
+
+; GFX9: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL0]]
+; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[MASKED]]
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2i16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
+ %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
+ %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
+ %val1 = load volatile i32, i32 addrspace(1)* %in1.gep
+ %lo = trunc i32 %val0 to i16
+ %hi = trunc i32 %val1 to i16
+ %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
+ %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
+ %vec.i32 = bitcast <2 x i16> %vec.1 to i32
+ call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2i16_user:
+; GFX9: flat_load_dword [[VAL0:v[0-9]+]]
+; GFX9: flat_load_dword [[VAL1:v[0-9]+]]
+
+; GFX9: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL0]]
+; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[MASKED]]
+
+; GFX9: v_add_i32_e32 v{{[0-9]+}}, vcc, 9, [[PACKED]]
+define amdgpu_kernel void @v_pack_v2i16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
+ %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
+ %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
+ %val1 = load volatile i32, i32 addrspace(1)* %in1.gep
+ %lo = trunc i32 %val0 to i16
+ %hi = trunc i32 %val1 to i16
+ %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
+ %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
+ %vec.i32 = bitcast <2 x i16> %vec.1 to i32
+ %foo = add i32 %vec.i32, 9
+ store volatile i32 %foo, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2i16_imm_lo:
+; GFX9-DAG: flat_load_dword [[VAL1:v[0-9]+]]
+
+; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x7b{{$}}
+; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]]
+
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2i16_imm_lo(i32 addrspace(1)* %in1) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
+ %val1 = load volatile i32, i32 addrspace(1)* %in1.gep
+ %hi = trunc i32 %val1 to i16
+ %vec.0 = insertelement <2 x i16> undef, i16 123, i32 0
+ %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
+ %vec.i32 = bitcast <2 x i16> %vec.1 to i32
+ call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2i16_inline_imm_lo:
+; GFX9: flat_load_dword [[VAL1:v[0-9]+]]
+
+; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, 64
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(i32 addrspace(1)* %in1) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
+ %val1 = load volatile i32, i32 addrspace(1)* %in1.gep
+ %hi = trunc i32 %val1 to i16
+ %vec.0 = insertelement <2 x i16> undef, i16 64, i32 0
+ %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
+ %vec.i32 = bitcast <2 x i16> %vec.1 to i32
+ call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2i16_imm_hi:
+; GFX9-DAG: flat_load_dword [[VAL0:v[0-9]+]]
+
+; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x7b{{$}}
+; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[VAL0]]
+
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2i16_imm_hi(i32 addrspace(1)* %in0) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
+ %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
+ %lo = trunc i32 %val0 to i16
+ %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
+ %vec.1 = insertelement <2 x i16> %vec.0, i16 123, i32 1
+ %vec.i32 = bitcast <2 x i16> %vec.1 to i32
+ call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2i16_inline_imm_hi:
+; GFX9: flat_load_dword [[VAL:v[0-9]+]]
+; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], 7, 16, [[VAL]]
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(i32 addrspace(1)* %in0) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
+ %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
+ %lo = trunc i32 %val0 to i16
+ %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
+ %vec.1 = insertelement <2 x i16> %vec.0, i16 7, i32 1
+ %vec.i32 = bitcast <2 x i16> %vec.1 to i32
+ call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/packetizer.ll b/test/CodeGen/AMDGPU/packetizer.ll
index 49a7c0df748f..1764d64c367f 100644
--- a/test/CodeGen/AMDGPU/packetizer.ll
+++ b/test/CodeGen/AMDGPU/packetizer.ll
@@ -7,7 +7,7 @@
; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z
; CHECK: BIT_ALIGN_INT * T{{[0-9]}}.W
-define void @test(i32 addrspace(1)* %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) {
entry:
%shl = sub i32 32, %e
%x = add i32 %x_arg, 1
diff --git a/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/test/CodeGen/AMDGPU/parallelandifcollapse.ll
index ea943a533c81..a90f200f79e3 100644
--- a/test/CodeGen/AMDGPU/parallelandifcollapse.ll
+++ b/test/CodeGen/AMDGPU/parallelandifcollapse.ll
@@ -11,7 +11,7 @@
; to do its transformation; however, now that we are using local memory for
; allocas, the transformation isn't happening.
-define void @_Z9chk1D_512v() #0 {
+define amdgpu_kernel void @_Z9chk1D_512v() #0 {
entry:
%a0 = alloca i32, align 4
%b0 = alloca i32, align 4
diff --git a/test/CodeGen/AMDGPU/parallelorifcollapse.ll b/test/CodeGen/AMDGPU/parallelorifcollapse.ll
index 1da1e91b8ab8..91116b0f65ea 100644
--- a/test/CodeGen/AMDGPU/parallelorifcollapse.ll
+++ b/test/CodeGen/AMDGPU/parallelorifcollapse.ll
@@ -12,7 +12,7 @@
; CHECK: OR_INT
; CHECK-NEXT: OR_INT
; CHECK-NEXT: OR_INT
-define void @_Z9chk1D_512v() #0 {
+define amdgpu_kernel void @_Z9chk1D_512v() #0 {
entry:
%a0 = alloca i32, align 4
%b0 = alloca i32, align 4
diff --git a/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
new file mode 100644
index 000000000000..77d793201adc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
@@ -0,0 +1,638 @@
+; RUN: llc -O0 -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VGPR -check-prefix=GCN %s
+
+; FIXME: we should disable the SDWA peephole because dead-code elimination,
+; which runs after the peephole pass, ruins this test (different register numbers).
+
+; Spill all SGPRs so that multiple VGPRs are required to spill them all.
+
+; Ideally we would only need 2 VGPRs for all of the spilling. The VGPRs are
+; allocated per frame index, so it is possible to end up with more.
+
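+; Back-of-the-envelope VGPR count: the 17 inline asm defs of <8 x i32>
+; below give 17 * 8 = 136 SGPRs to spill, and a VGPR has 64 lanes, so at
+; least ceil(136 / 64) = 3 spill VGPRs (v0, v1, v2 in the checks) are
+; needed even with perfect lane packing.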
+; GCN-LABEL: {{^}}spill_sgprs_to_multiple_vgprs:
+
+; GCN: def s[8:15]
+; GCN: def s[16:23]
+; GCN: def s[24:31]
+; GCN: def s[32:39]
+; GCN: def s[40:47]
+; GCN: def s[48:55]
+; GCN: def s[56:63]
+; GCN: def s[64:71]
+; GCN: def s[72:79]
+; GCN: def s[80:87]
+; GCN: def s[88:95]
+
+; GCN: v_writelane_b32 v0, s8, 0
+; GCN-NEXT: v_writelane_b32 v0, s9, 1
+; GCN-NEXT: v_writelane_b32 v0, s10, 2
+; GCN-NEXT: v_writelane_b32 v0, s11, 3
+; GCN-NEXT: v_writelane_b32 v0, s12, 4
+; GCN-NEXT: v_writelane_b32 v0, s13, 5
+; GCN-NEXT: v_writelane_b32 v0, s14, 6
+; GCN-NEXT: v_writelane_b32 v0, s15, 7
+
+; GCN: def s{{\[}}[[TMP_LO:[0-9]+]]:[[TMP_HI:[0-9]+]]{{\]}}
+; GCN: v_writelane_b32 v0, s[[TMP_LO]], 8
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 9
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 10
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 11
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 12
+; GCN-NEXT: v_writelane_b32 v0, s13, 13
+; GCN-NEXT: v_writelane_b32 v0, s14, 14
+; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 15
+
+; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
+; GCN: v_writelane_b32 v0, s[[TMP_LO]], 16
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 17
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 18
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 19
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 20
+; GCN-NEXT: v_writelane_b32 v0, s13, 21
+; GCN-NEXT: v_writelane_b32 v0, s14, 22
+; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 23
+
+; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
+; GCN: v_writelane_b32 v0, s[[TMP_LO]], 24
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 25
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 26
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 27
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 28
+; GCN-NEXT: v_writelane_b32 v0, s13, 29
+; GCN-NEXT: v_writelane_b32 v0, s14, 30
+; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 31
+
+; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
+; GCN: v_writelane_b32 v0, s[[TMP_LO]], 32
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 33
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 34
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 35
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 36
+; GCN-NEXT: v_writelane_b32 v0, s13, 37
+; GCN-NEXT: v_writelane_b32 v0, s14, 38
+; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 39
+
+; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
+; GCN: v_writelane_b32 v0, s[[TMP_LO]], 40
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 41
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 42
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 43
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 44
+; GCN-NEXT: v_writelane_b32 v0, s13, 45
+; GCN-NEXT: v_writelane_b32 v0, s14, 46
+; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 47
+
+; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
+; GCN: v_writelane_b32 v0, s[[TMP_LO]], 48
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 49
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 50
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 51
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 52
+; GCN-NEXT: v_writelane_b32 v0, s13, 53
+; GCN-NEXT: v_writelane_b32 v0, s14, 54
+; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 55
+
+; GCN-NEXT: v_writelane_b32 v0, s88, 56
+; GCN-NEXT: v_writelane_b32 v0, s89, 57
+; GCN-NEXT: v_writelane_b32 v0, s90, 58
+; GCN-NEXT: v_writelane_b32 v0, s91, 59
+; GCN-NEXT: v_writelane_b32 v0, s92, 60
+; GCN-NEXT: v_writelane_b32 v0, s93, 61
+; GCN-NEXT: v_writelane_b32 v0, s94, 62
+; GCN-NEXT: v_writelane_b32 v0, s95, 63
+; GCN-NEXT: v_writelane_b32 v1, s16, 0
+; GCN-NEXT: v_writelane_b32 v1, s17, 1
+; GCN-NEXT: v_writelane_b32 v1, s18, 2
+; GCN-NEXT: v_writelane_b32 v1, s19, 3
+; GCN-NEXT: v_writelane_b32 v1, s20, 4
+; GCN-NEXT: v_writelane_b32 v1, s21, 5
+; GCN-NEXT: v_writelane_b32 v1, s22, 6
+; GCN-NEXT: v_writelane_b32 v1, s23, 7
+; GCN-NEXT: v_writelane_b32 v1, s24, 8
+; GCN-NEXT: v_writelane_b32 v1, s25, 9
+; GCN-NEXT: v_writelane_b32 v1, s26, 10
+; GCN-NEXT: v_writelane_b32 v1, s27, 11
+; GCN-NEXT: v_writelane_b32 v1, s28, 12
+; GCN-NEXT: v_writelane_b32 v1, s29, 13
+; GCN-NEXT: v_writelane_b32 v1, s30, 14
+; GCN-NEXT: v_writelane_b32 v1, s31, 15
+; GCN-NEXT: v_writelane_b32 v1, s32, 16
+; GCN-NEXT: v_writelane_b32 v1, s33, 17
+; GCN-NEXT: v_writelane_b32 v1, s34, 18
+; GCN-NEXT: v_writelane_b32 v1, s35, 19
+; GCN-NEXT: v_writelane_b32 v1, s36, 20
+; GCN-NEXT: v_writelane_b32 v1, s37, 21
+; GCN-NEXT: v_writelane_b32 v1, s38, 22
+; GCN-NEXT: v_writelane_b32 v1, s39, 23
+; GCN-NEXT: v_writelane_b32 v1, s40, 24
+; GCN-NEXT: v_writelane_b32 v1, s41, 25
+; GCN-NEXT: v_writelane_b32 v1, s42, 26
+; GCN-NEXT: v_writelane_b32 v1, s43, 27
+; GCN-NEXT: v_writelane_b32 v1, s44, 28
+; GCN-NEXT: v_writelane_b32 v1, s45, 29
+; GCN-NEXT: v_writelane_b32 v1, s46, 30
+; GCN-NEXT: v_writelane_b32 v1, s47, 31
+; GCN-NEXT: v_writelane_b32 v1, s48, 32
+; GCN-NEXT: v_writelane_b32 v1, s49, 33
+; GCN-NEXT: v_writelane_b32 v1, s50, 34
+; GCN-NEXT: v_writelane_b32 v1, s51, 35
+; GCN-NEXT: v_writelane_b32 v1, s52, 36
+; GCN-NEXT: v_writelane_b32 v1, s53, 37
+; GCN-NEXT: v_writelane_b32 v1, s54, 38
+; GCN-NEXT: v_writelane_b32 v1, s55, 39
+; GCN-NEXT: v_writelane_b32 v1, s56, 40
+; GCN-NEXT: v_writelane_b32 v1, s57, 41
+; GCN-NEXT: v_writelane_b32 v1, s58, 42
+; GCN-NEXT: v_writelane_b32 v1, s59, 43
+; GCN-NEXT: v_writelane_b32 v1, s60, 44
+; GCN-NEXT: v_writelane_b32 v1, s61, 45
+; GCN-NEXT: v_writelane_b32 v1, s62, 46
+; GCN-NEXT: v_writelane_b32 v1, s63, 47
+; GCN-NEXT: v_writelane_b32 v1, s64, 48
+; GCN-NEXT: v_writelane_b32 v1, s65, 49
+; GCN-NEXT: v_writelane_b32 v1, s66, 50
+; GCN-NEXT: v_writelane_b32 v1, s67, 51
+; GCN-NEXT: v_writelane_b32 v1, s68, 52
+; GCN-NEXT: v_writelane_b32 v1, s69, 53
+; GCN-NEXT: v_writelane_b32 v1, s70, 54
+; GCN-NEXT: v_writelane_b32 v1, s71, 55
+; GCN-NEXT: v_writelane_b32 v1, s72, 56
+; GCN-NEXT: v_writelane_b32 v1, s73, 57
+; GCN-NEXT: v_writelane_b32 v1, s74, 58
+; GCN-NEXT: v_writelane_b32 v1, s75, 59
+; GCN-NEXT: v_writelane_b32 v1, s76, 60
+; GCN-NEXT: v_writelane_b32 v1, s77, 61
+; GCN-NEXT: v_writelane_b32 v1, s78, 62
+; GCN-NEXT: v_writelane_b32 v1, s79, 63
+; GCN-NEXT: v_writelane_b32 v2, s80, 0
+; GCN-NEXT: v_writelane_b32 v2, s81, 1
+; GCN-NEXT: v_writelane_b32 v2, s82, 2
+; GCN-NEXT: v_writelane_b32 v2, s83, 3
+; GCN-NEXT: v_writelane_b32 v2, s84, 4
+; GCN-NEXT: v_writelane_b32 v2, s85, 5
+; GCN-NEXT: v_writelane_b32 v2, s86, 6
+; GCN-NEXT: v_writelane_b32 v2, s87, 7
+; GCN: s_cbranch_scc1
+
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v0, 0
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 1
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 2
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 3
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 4
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 5
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 6
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v0, 7
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 0
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 1
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 2
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 3
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 4
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 5
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 6
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 7
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 8
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 9
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 10
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 11
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 12
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 13
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 14
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 15
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 16
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 17
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 18
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 19
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 20
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 21
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 22
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 23
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 24
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 25
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 26
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 27
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 28
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 29
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 30
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 31
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 32
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 33
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 34
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 35
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 36
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 37
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 38
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 39
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 40
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 41
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 42
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 43
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 44
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 45
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 46
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 47
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 48
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 49
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 50
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 51
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 52
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 53
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 54
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 55
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 56
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 57
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 58
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 59
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 60
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 61
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 62
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 63
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s{{[0-9]+}}, v2, 0
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 1
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 2
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 3
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 4
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 5
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 6
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 7
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 56
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 57
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 58
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 59
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 60
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 61
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 62
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 63
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 8
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 9
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 10
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 11
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 12
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 13
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 14
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 15
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 16
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 17
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 18
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 19
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 20
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 21
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 22
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 23
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 24
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 25
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 26
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 27
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 28
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 29
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 30
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 31
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 32
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 33
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 34
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 35
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 36
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 37
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 38
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 39
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 40
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 41
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 42
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 43
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 44
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 45
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 46
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 47
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 48
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 49
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 50
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 51
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 52
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 53
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 54
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 55
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(i32 addrspace(1)* %out, i32 %in) #0 {
+ %wide.sgpr0 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr1 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr2 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr3 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr4 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr5 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr6 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr7 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr8 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr9 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr10 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr11 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr12 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr13 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr14 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr15 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr16 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+ %cmp = icmp eq i32 %in, 0
+ br i1 %cmp, label %bb0, label %ret
+
+bb0:
+ call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr0) #0
+ call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr1) #0
+ call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr2) #0
+ call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr3) #0
+ call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr4) #0
+ call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr5) #0
+ call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr6) #0
+ call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr7) #0
+ call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr8) #0
+ call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr9) #0
+ call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr10) #0
+ call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr11) #0
+ call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr12) #0
+ call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr13) #0
+ call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr14) #0
+ call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr15) #0
+ call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr16) #0
+ br label %ret
+
+ret:
+ ret void
+}
+
+; Some of the lanes of an SGPR spill fit in one VGPR and the rest are
+; forced into the next available VGPR.
+
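+; For example, the <16 x i32> def checked below starts at lane 50 of v0:
+; lanes 50-63 take the first 14 elements, and the last 2 elements spill
+; over into lanes 0-1 of v1.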
+; GCN-LABEL: {{^}}split_sgpr_spill_2_vgprs:
+; GCN: def s[24:39]
+
+; GCN: v_writelane_b32 v0, s24, 50
+; GCN-NEXT: v_writelane_b32 v0, s25, 51
+; GCN-NEXT: v_writelane_b32 v0, s26, 52
+; GCN-NEXT: v_writelane_b32 v0, s27, 53
+; GCN-NEXT: v_writelane_b32 v0, s28, 54
+; GCN-NEXT: v_writelane_b32 v0, s29, 55
+; GCN-NEXT: v_writelane_b32 v0, s30, 56
+; GCN-NEXT: v_writelane_b32 v0, s31, 57
+; GCN-NEXT: v_writelane_b32 v0, s32, 58
+; GCN-NEXT: v_writelane_b32 v0, s33, 59
+; GCN-NEXT: v_writelane_b32 v0, s34, 60
+; GCN-NEXT: v_writelane_b32 v0, s35, 61
+; GCN-NEXT: v_writelane_b32 v0, s36, 62
+; GCN-NEXT: v_writelane_b32 v0, s37, 63
+; GCN-NEXT: v_writelane_b32 v1, s38, 0
+; GCN-NEXT: v_writelane_b32 v1, s39, 1
+
+; GCN: v_readlane_b32 s4, v0, 50
+; GCN-NEXT: v_readlane_b32 s5, v0, 51
+; GCN-NEXT: v_readlane_b32 s6, v0, 52
+; GCN-NEXT: v_readlane_b32 s7, v0, 53
+; GCN-NEXT: v_readlane_b32 s8, v0, 54
+; GCN-NEXT: v_readlane_b32 s9, v0, 55
+; GCN-NEXT: v_readlane_b32 s10, v0, 56
+; GCN-NEXT: v_readlane_b32 s11, v0, 57
+; GCN-NEXT: v_readlane_b32 s12, v0, 58
+; GCN-NEXT: v_readlane_b32 s13, v0, 59
+; GCN-NEXT: v_readlane_b32 s14, v0, 60
+; GCN-NEXT: v_readlane_b32 s15, v0, 61
+; GCN-NEXT: v_readlane_b32 s16, v0, 62
+; GCN-NEXT: v_readlane_b32 s17, v0, 63
+; GCN-NEXT: v_readlane_b32 s18, v1, 0
+; GCN-NEXT: v_readlane_b32 s19, v1, 1
+define amdgpu_kernel void @split_sgpr_spill_2_vgprs(i32 addrspace(1)* %out, i32 %in) #1 {
+ %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr2 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr5 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr3 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr4 = call <2 x i32> asm sideeffect "; def $0", "=s" () #0
+
+ %cmp = icmp eq i32 %in, 0
+ br i1 %cmp, label %bb0, label %ret
+
+bb0:
+ call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr0) #0
+ call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr1) #0
+ call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr2) #0
+ call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr3) #0
+ call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr4) #0
+ call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr5) #0
+ br label %ret
+
+ret:
+ ret void
+}
+
+; The first 64 SGPR spills can go to a VGPR, but there isn't a second
+; available, so some spills must go to memory. The last 16-element spill
+; runs out of lanes at the 15th element.
+
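+; Rough lane arithmetic: the asm clobbers below leave v23 as the only
+; spill VGPR. Three <16 x i32> defs plus one <2 x i32> def occupy
+; 3 * 16 + 2 = 50 of its 64 lanes, so the remaining <16 x i32> def no
+; longer fits and is spilled to scratch memory instead, via the 16
+; buffer_store_dword / buffer_load_dword pairs checked below.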
+; GCN-LABEL: {{^}}no_vgprs_last_sgpr_spill:
+
+; GCN: v_writelane_b32 v23, s{{[0-9]+}}, 0
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 1
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 2
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 3
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 4
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 5
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 6
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 7
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 8
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 9
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 10
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 11
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 12
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 13
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 14
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 15
+
+; GCN: v_writelane_b32 v23, s{{[0-9]+}}, 16
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 17
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 18
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 19
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 20
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 21
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 22
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 23
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 24
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 25
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 26
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 27
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 28
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 29
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 30
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 31
+
+; GCN: def s[0:1]
+; GCN: v_writelane_b32 v23, s0, 32
+; GCN-NEXT: v_writelane_b32 v23, s1, 33
+
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 34
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 35
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 36
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 37
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 38
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 39
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 40
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 41
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 42
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 43
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 44
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 45
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 46
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 47
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 48
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 49
+
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: s_cbranch_scc1
+
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 0
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 1
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 2
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 3
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 4
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 5
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 6
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 7
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 8
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 9
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 10
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 11
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 12
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 13
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 14
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 15
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 34
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 35
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 36
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 37
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 38
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 39
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 40
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 41
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 42
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 43
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 44
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 45
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 46
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 47
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 48
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 49
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 16
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 17
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 18
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 19
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 20
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 21
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 22
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 23
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 24
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 25
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 26
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 27
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 28
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 29
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 30
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 31
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+
+; GCN: v_readlane_b32 s0, v23, 32
+; GCN: v_readlane_b32 s1, v23, 33
+; GCN: ;;#ASMSTART
+; GCN: ; use s[0:1]
+define amdgpu_kernel void @no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 {
+ call void asm sideeffect "", "~{VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7}" () #0
+ call void asm sideeffect "", "~{VGPR8_VGPR9_VGPR10_VGPR11_VGPR12_VGPR13_VGPR14_VGPR15}" () #0
+ call void asm sideeffect "", "~{VGPR16_VGPR17_VGPR18_VGPR19}"() #0
+ call void asm sideeffect "", "~{VGPR20_VGPR21}"() #0
+ call void asm sideeffect "", "~{VGPR22}"() #0
+
+ %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr2 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr3 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
+ %wide.sgpr4 = call <2 x i32> asm sideeffect "; def $0", "=s" () #0
+ %cmp = icmp eq i32 %in, 0
+ br i1 %cmp, label %bb0, label %ret
+
+bb0:
+ call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr0) #0
+ call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr1) #0
+ call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr2) #0
+ call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr3) #0
+ call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr4) #0
+ br label %ret
+
+ret:
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "amdgpu-waves-per-eu"="10,10" }
diff --git a/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll b/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
index 3e0d36978ad4..4bcfe5f3d28c 100644
--- a/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
+++ b/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
@@ -10,7 +10,7 @@
declare i32 @llvm.amdgcn.workitem.id.x() #1
-define void @dead_def_subregister(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @dead_def_subregister(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%val = load i64, i64 addrspace(1)* %in.gep
diff --git a/test/CodeGen/AMDGPU/predicates.ll b/test/CodeGen/AMDGPU/predicates.ll
index c1af815c7b1e..566b48eb8864 100644
--- a/test/CodeGen/AMDGPU/predicates.ll
+++ b/test/CodeGen/AMDGPU/predicates.ll
@@ -6,7 +6,7 @@
; CHECK-LABEL: {{^}}simple_if:
; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-define void @simple_if(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @simple_if(i32 addrspace(1)* %out, i32 %in) {
entry:
%cmp0 = icmp sgt i32 %in, 0
br i1 %cmp0, label %IF, label %ENDIF
@@ -25,7 +25,7 @@ ENDIF:
; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-define void @simple_if_else(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @simple_if_else(i32 addrspace(1)* %out, i32 %in) {
entry:
%0 = icmp sgt i32 %in, 0
br i1 %0, label %IF, label %ELSE
@@ -51,7 +51,7 @@ ENDIF:
; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec
; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-define void @nested_if(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @nested_if(i32 addrspace(1)* %out, i32 %in) {
entry:
%0 = icmp sgt i32 %in, 0
br i1 %0, label %IF0, label %ENDIF
@@ -79,7 +79,7 @@ ENDIF:
; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-define void @nested_if_else(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @nested_if_else(i32 addrspace(1)* %out, i32 %in) {
entry:
%0 = icmp sgt i32 %in, 0
br i1 %0, label %IF0, label %ENDIF
diff --git a/test/CodeGen/AMDGPU/private-access-no-objects.ll b/test/CodeGen/AMDGPU/private-access-no-objects.ll
index 2894730eccb1..af2683510293 100644
--- a/test/CodeGen/AMDGPU/private-access-no-objects.ll
+++ b/test/CodeGen/AMDGPU/private-access-no-objects.ll
@@ -18,7 +18,7 @@
; OPTNONE-NOT: s_mov_b32
; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
-define void @store_to_undef() #0 {
+define amdgpu_kernel void @store_to_undef() #0 {
store volatile i32 0, i32* undef
ret void
}
@@ -28,7 +28,7 @@ define void @store_to_undef() #0 {
; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
-define void @store_to_inttoptr() #0 {
+define amdgpu_kernel void @store_to_inttoptr() #0 {
store volatile i32 0, i32* inttoptr (i32 123 to i32*)
ret void
}
@@ -38,7 +38,7 @@ define void @store_to_inttoptr() #0 {
; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
-define void @load_from_undef() #0 {
+define amdgpu_kernel void @load_from_undef() #0 {
%ld = load volatile i32, i32* undef
ret void
}
@@ -48,7 +48,7 @@ define void @load_from_undef() #0 {
; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
-define void @load_from_inttoptr() #0 {
+define amdgpu_kernel void @load_from_inttoptr() #0 {
%ld = load volatile i32, i32* inttoptr (i32 123 to i32*)
ret void
}
diff --git a/test/CodeGen/AMDGPU/private-element-size.ll b/test/CodeGen/AMDGPU/private-element-size.ll
index de9a8f755122..f80543079701 100644
--- a/test/CodeGen/AMDGPU/private-element-size.ll
+++ b/test/CodeGen/AMDGPU/private-element-size.ll
@@ -10,33 +10,33 @@
; HSA-ELT4: private_element_size = 1
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32
; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24{{$}}
; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:40
; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}}
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}}
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}}
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}}
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:44{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
-define void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+define amdgpu_kernel void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%idxprom = sext i32 %tid to i64
@@ -59,36 +59,28 @@ entry:
; HSA-ELT8: private_element_size = 2
; HSA-ELT4: private_element_size = 1
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32
; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:48
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:64
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:80
; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24
; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32
; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:40
; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:48
; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:56
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:88
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:80
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:72
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:64
; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}}
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}}
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}}
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}}
@@ -97,6 +89,14 @@ entry:
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:52{{$}}
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:56{{$}}
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:60{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:64{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:68{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:72{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:76{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:80{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:84{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:88{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:92{{$}}
; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
@@ -106,7 +106,7 @@ entry:
; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}
; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
-define void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+define amdgpu_kernel void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%idxprom = sext i32 %tid to i64
@@ -130,20 +130,20 @@ entry:
; HSA-ELT8: private_element_size = 2
; HSA-ELT4: private_element_size = 1
-; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
-; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], s9 offset:1
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], s9 offset:2
; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}}
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
-define void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+define amdgpu_kernel void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%idxprom = sext i32 %tid to i64
@@ -166,20 +166,20 @@ entry:
; HSA-ELT8: private_element_size = 2
; HSA-ELT4: private_element_size = 1
-; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
-; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24
; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}}
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
-define void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+define amdgpu_kernel void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%idxprom = sext i32 %tid to i64
@@ -202,33 +202,33 @@ entry:
; HSA-ELT8: private_element_size = 2
; HSA-ELT4: private_element_size = 1
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32
; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16{{$}}
; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:40
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32
; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}}
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}}
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}}
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}}
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:44{{$}}
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
-define void @private_elt_size_v2i64(<2 x i64> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+define amdgpu_kernel void @private_elt_size_v2i64(<2 x i64> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%idxprom = sext i32 %tid to i64
diff --git a/test/CodeGen/AMDGPU/private-memory-atomics.ll b/test/CodeGen/AMDGPU/private-memory-atomics.ll
index eea10c862238..9fa3051928a0 100644
--- a/test/CodeGen/AMDGPU/private-memory-atomics.ll
+++ b/test/CodeGen/AMDGPU/private-memory-atomics.ll
@@ -4,7 +4,7 @@
; This works because promote allocas pass replaces these with LDS atomics.
; Private atomics have no real use, but at least shouldn't crash on it.
-define void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+define amdgpu_kernel void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind {
entry:
%tmp = alloca [2 x i32]
%tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
@@ -17,7 +17,7 @@ entry:
ret void
}
-define void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+define amdgpu_kernel void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind {
entry:
%tmp = alloca [2 x i32]
%tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
diff --git a/test/CodeGen/AMDGPU/private-memory-broken.ll b/test/CodeGen/AMDGPU/private-memory-broken.ll
index 8ba0b70dbdbb..9b5f655f1b52 100644
--- a/test/CodeGen/AMDGPU/private-memory-broken.ll
+++ b/test/CodeGen/AMDGPU/private-memory-broken.ll
@@ -7,7 +7,7 @@
declare i32 @foo(i32*) nounwind
-define void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+define amdgpu_kernel void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind {
entry:
%tmp = alloca [2 x i32]
%tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
diff --git a/test/CodeGen/AMDGPU/private-memory-r600.ll b/test/CodeGen/AMDGPU/private-memory-r600.ll
index 3e1796959aa6..866cd16ec3b5 100644
--- a/test/CodeGen/AMDGPU/private-memory-r600.ll
+++ b/test/CodeGen/AMDGPU/private-memory-r600.ll
@@ -12,11 +12,11 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; OPT: call i32 @llvm.r600.read.local.size.y(), !range !0
; OPT: call i32 @llvm.r600.read.local.size.z(), !range !0
-; OPT: call i32 @llvm.r600.read.tidig.x(), !range !0
-; OPT: call i32 @llvm.r600.read.tidig.y(), !range !0
-; OPT: call i32 @llvm.r600.read.tidig.z(), !range !0
+; OPT: call i32 @llvm.r600.read.tidig.x(), !range !1
+; OPT: call i32 @llvm.r600.read.tidig.y(), !range !1
+; OPT: call i32 @llvm.r600.read.tidig.z(), !range !1
-define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
%stack = alloca [5 x i32], align 4
%0 = load i32, i32 addrspace(1)* %in, align 4
@@ -47,7 +47,7 @@ entry:
; R600-NOT: MOVA_INT
%struct.point = type { i32, i32 }
-define void @multiple_structs(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 {
entry:
%a = alloca %struct.point
%b = alloca %struct.point
@@ -75,7 +75,7 @@ entry:
; FUNC-LABEL: {{^}}direct_loop:
; R600-NOT: MOVA_INT
-define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
entry:
%prv_array_const = alloca [2 x i32]
%prv_array = alloca [2 x i32]
@@ -110,7 +110,7 @@ for.end:
; FUNC-LABEL: {{^}}short_array:
; R600: MOVA_INT
-define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
%0 = alloca [2 x i16]
%1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0
@@ -127,7 +127,7 @@ entry:
; FUNC-LABEL: {{^}}char_array:
; R600: MOVA_INT
-define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
%0 = alloca [2 x i8]
%1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
@@ -148,7 +148,7 @@ entry:
; R600-NOT: MOV T0.X
; Additional check in case the move ends up in the last slot
; R600-NOT: MOV * T0.X
-define void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 {
entry:
%0 = alloca [2 x i32]
%1 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 0
@@ -169,7 +169,7 @@ entry:
; R600_CHECK: MOV
; R600_CHECK: [[CHAN:[XYZW]]]+
; R600-NOT: [[CHAN]]+
-define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
entry:
%0 = alloca [3 x i8], align 1
%1 = alloca [2 x i8], align 1
@@ -193,7 +193,7 @@ entry:
ret void
}
-define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x [2 x i8]]
%gep0 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
@@ -207,7 +207,7 @@ entry:
ret void
}
-define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x [2 x i32]]
%gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
@@ -220,7 +220,7 @@ entry:
ret void
}
-define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x [2 x i64]]
%gep0 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
@@ -235,7 +235,7 @@ entry:
%struct.pair32 = type { i32, i32 }
-define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x [2 x %struct.pair32]]
%gep0 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
@@ -248,7 +248,7 @@ entry:
ret void
}
-define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x %struct.pair32]
%gep0 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
@@ -261,7 +261,7 @@ entry:
ret void
}
-define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
entry:
%tmp = alloca [2 x i32]
%tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
@@ -282,7 +282,7 @@ entry:
; SI-NOT: ds_write
; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ;
-define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%alloca = alloca [16 x i32]
%tmp0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
store i32 5, i32* %tmp0
@@ -295,6 +295,7 @@ define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
ret void
}
-; OPT: !0 = !{i32 0, i32 2048}
+; OPT: !0 = !{i32 0, i32 257}
+; OPT: !1 = !{i32 0, i32 256}
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll b/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll
index 3bd0aecf7aa9..41a68b18b0a7 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll
@@ -5,7 +5,7 @@
; CHECK-LABEL: @array_alloca(
; CHECK: %stack = alloca i32, i32 5, align 4
-define void @array_alloca(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @array_alloca(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
%stack = alloca i32, i32 5, align 4
%ld0 = load i32, i32 addrspace(1)* %in, align 4
@@ -27,7 +27,7 @@ entry:
; CHECK-LABEL: @array_alloca_dynamic(
; CHECK: %stack = alloca i32, i32 %size, align 4
-define void @array_alloca_dynamic(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %size) #0 {
+define amdgpu_kernel void @array_alloca_dynamic(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %size) #0 {
entry:
%stack = alloca i32, i32 %size, align 4
%ld0 = load i32, i32 addrspace(1)* %in, align 4
diff --git a/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll b/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
index 82030f377d9f..a5eb92de9e4b 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
@@ -7,14 +7,14 @@ declare void @foo(float*) #0
declare void @foo.varargs(...) #0
; CHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo
-define void @crash_call_constexpr_cast() #0 {
+define amdgpu_kernel void @crash_call_constexpr_cast() #0 {
%alloca = alloca i32
call void bitcast (void (float*)* @foo to void (i32*)*)(i32* %alloca) #0
ret void
}
; CHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo.varargs
-define void @crash_call_constexpr_cast_varargs() #0 {
+define amdgpu_kernel void @crash_call_constexpr_cast_varargs() #0 {
%alloca = alloca i32
call void bitcast (void (...)* @foo.varargs to void (i32*)*)(i32* %alloca) #0
ret void
diff --git a/test/CodeGen/AMDGPU/promote-alloca-globals.ll b/test/CodeGen/AMDGPU/promote-alloca-globals.ll
index eb0d0cc62697..38db51d4c8c6 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-globals.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-globals.ll
@@ -5,12 +5,12 @@
@global_array0 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
@global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
-; IR-LABEL: define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
; IR: alloca [10 x i32]
; ASM-LABEL: {{^}}promote_alloca_size_256:
; ASM: ; LDSByteSize: 60000 bytes/workgroup (compile time only)
-define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
entry:
%stack = alloca [10 x i32], align 4
%tmp = load i32, i32 addrspace(1)* %in, align 4
diff --git a/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll b/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
index 4c3c15dac0d1..f83eb56dc6ed 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
@@ -7,7 +7,7 @@ declare i8* @llvm.invariant.group.barrier(i8*) #1
; GCN-LABEL: {{^}}use_invariant_promotable_lds:
; GCN: buffer_load_dword
; GCN: ds_write_b32
-define void @use_invariant_promotable_lds(i32 addrspace(1)* %arg) #2 {
+define amdgpu_kernel void @use_invariant_promotable_lds(i32 addrspace(1)* %arg) #2 {
bb:
%tmp = alloca i32, align 4
%tmp1 = bitcast i32* %tmp to i8*
diff --git a/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll b/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll
index eeda19fa27ac..bd4571a9616b 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll
@@ -1,21 +1,21 @@
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-promote-alloca %s | FileCheck -check-prefix=OPT %s
-declare void @llvm.lifetime.start(i64, i8* nocapture) #0
-declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
; OPT-LABEL: @use_lifetime_promotable_lds(
; OPT-NOT: alloca i32
; OPT-NOT: llvm.lifetime
; OPT: store i32 %tmp3, i32 addrspace(3)*
-define void @use_lifetime_promotable_lds(i32 addrspace(1)* %arg) #2 {
+define amdgpu_kernel void @use_lifetime_promotable_lds(i32 addrspace(1)* %arg) #2 {
bb:
%tmp = alloca i32, align 4
%tmp1 = bitcast i32* %tmp to i8*
- call void @llvm.lifetime.start(i64 4, i8* %tmp1)
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %tmp1)
%tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
%tmp3 = load i32, i32 addrspace(1)* %tmp2
store i32 %tmp3, i32* %tmp
- call void @llvm.lifetime.end(i64 4, i8* %tmp1)
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %tmp1)
ret void
}
diff --git a/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll b/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
index 9cea1a23ea98..7a4a451ff360 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
@@ -8,13 +8,13 @@ declare void @llvm.memmove.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocaptu
declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) #0
-declare i32 @llvm.objectsize.i32.p0i8(i8*, i1) #1
+declare i32 @llvm.objectsize.i32.p0i8(i8*, i1, i1) #1
; CHECK-LABEL: @promote_with_memcpy(
; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memcpy.alloca, i32 0, i32 %{{[0-9]+}}
; CHECK: call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false)
; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out.bc, i8 addrspace(3)* %alloca.bc, i32 68, i32 4, i1 false)
-define void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%alloca = alloca [17 x i32], align 4
%alloca.bc = bitcast [17 x i32]* %alloca to i8*
%in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
@@ -28,7 +28,7 @@ define void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memmove.alloca, i32 0, i32 %{{[0-9]+}}
; CHECK: call void @llvm.memmove.p3i8.p1i8.i32(i8 addrspace(3)* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false)
; CHECK: call void @llvm.memmove.p1i8.p3i8.i32(i8 addrspace(1)* %out.bc, i8 addrspace(3)* %alloca.bc, i32 68, i32 4, i1 false)
-define void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%alloca = alloca [17 x i32], align 4
%alloca.bc = bitcast [17 x i32]* %alloca to i8*
%in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
@@ -41,7 +41,7 @@ define void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; CHECK-LABEL: @promote_with_memset(
; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memset.alloca, i32 0, i32 %{{[0-9]+}}
; CHECK: call void @llvm.memset.p3i8.i32(i8 addrspace(3)* %alloca.bc, i8 7, i32 68, i32 4, i1 false)
-define void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%alloca = alloca [17 x i32], align 4
%alloca.bc = bitcast [17 x i32]* %alloca to i8*
%in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
@@ -52,11 +52,11 @@ define void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; CHECK-LABEL: @promote_with_objectsize(
; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_objectsize.alloca, i32 0, i32 %{{[0-9]+}}
-; CHECK: call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* %alloca.bc, i1 false)
-define void @promote_with_objectsize(i32 addrspace(1)* %out) #0 {
+; CHECK: call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* %alloca.bc, i1 false, i1 false)
+define amdgpu_kernel void @promote_with_objectsize(i32 addrspace(1)* %out) #0 {
%alloca = alloca [17 x i32], align 4
%alloca.bc = bitcast [17 x i32]* %alloca to i8*
- %size = call i32 @llvm.objectsize.i32.p0i8(i8* %alloca.bc, i1 false)
+ %size = call i32 @llvm.objectsize.i32.p0i8(i8* %alloca.bc, i1 false, i1 false)
store i32 %size, i32 addrspace(1)* %out
ret void
}
diff --git a/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll b/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
index 8ba849e5f884..9f22f2071797 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
@@ -5,7 +5,7 @@
; NOOPTS: workgroup_group_segment_byte_size = 0{{$}}
; NOOPTS-NOT: ds_write
; OPTS: ds_write
-define void @promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x [2 x i32]]
%gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
@@ -21,7 +21,7 @@ entry:
; ALL-LABEL: {{^}}optnone_promote_alloca_i32_array_array:
; ALL: workgroup_group_segment_byte_size = 0{{$}}
; ALL-NOT: ds_write
-define void @optnone_promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #1 {
+define amdgpu_kernel void @optnone_promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #1 {
entry:
%alloca = alloca [2 x [2 x i32]]
%gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
diff --git a/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll b/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
index 468a789e4a67..bf3bc493a4b8 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
@@ -30,7 +30,7 @@
; GCN-LABEL: {{^}}promote_alloca_size_order_0:
; GCN: workgroup_group_segment_byte_size = 2340
-define void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+define amdgpu_kernel void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
entry:
%stack = alloca [5 x i32], align 4
%tmp0 = load i32, i32 addrspace(1)* %in, align 4
@@ -62,7 +62,7 @@ entry:
; GCN-LABEL: {{^}}promote_alloca_size_order_1:
; GCN: workgroup_group_segment_byte_size = 2352
-define void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+define amdgpu_kernel void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
entry:
%stack = alloca [5 x i32], align 4
%tmp0 = load i32, i32 addrspace(1)* %in, align 4
@@ -100,7 +100,7 @@ entry:
; GCN-LABEL: {{^}}promote_alloca_align_pad_guess_over_limit:
; GCN: workgroup_group_segment_byte_size = 1060
-define void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+define amdgpu_kernel void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
entry:
%stack = alloca [5 x i32], align 4
%tmp0 = load i32, i32 addrspace(1)* %in, align 4
diff --git a/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll b/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
index 3bcbb4f986b7..03ce116cfcad 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
@@ -5,7 +5,7 @@
; GCN-LABEL: {{^}}stored_lds_pointer_value:
; GCN: buffer_store_dword v
-define void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 {
%tmp = alloca float
store float 0.0, float *%tmp
store float* %tmp, float* addrspace(1)* %ptr
@@ -14,7 +14,7 @@ define void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 {
; GCN-LABEL: {{^}}stored_lds_pointer_value_offset:
; GCN: buffer_store_dword v
-define void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 {
%tmp0 = alloca float
%tmp1 = alloca float
store float 0.0, float *%tmp0
@@ -29,7 +29,7 @@ define void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 {
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
-define void @stored_lds_pointer_value_gep(float* addrspace(1)* %ptr, i32 %idx) #0 {
+define amdgpu_kernel void @stored_lds_pointer_value_gep(float* addrspace(1)* %ptr, i32 %idx) #0 {
bb:
%tmp = alloca float, i32 16
store float 0.0, float* %tmp
@@ -46,7 +46,7 @@ bb:
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
-define void @stored_vector_pointer_value(i32* addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @stored_vector_pointer_value(i32* addrspace(1)* %out, i32 %index) {
entry:
%tmp0 = alloca [4 x i32]
%x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 0
@@ -64,7 +64,7 @@ entry:
; GCN-LABEL: {{^}}stored_fi_to_self:
; GCN-NOT: ds_
-define void @stored_fi_to_self() #0 {
+define amdgpu_kernel void @stored_fi_to_self() #0 {
%tmp = alloca i32*
store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp
%bitcast = bitcast i32** %tmp to i32*
diff --git a/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll b/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
index 2e7527dbdbc4..ebef61229905 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
@@ -8,7 +8,7 @@
; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b
; CHECK: %cmp = icmp eq i32 addrspace(3)* %ptr0, %ptr1
-define void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%alloca = alloca [16 x i32], align 4
%ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
%ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
@@ -22,7 +22,7 @@ define void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %ou
; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_null_rhs.alloca, i32 0, i32 %{{[0-9]+}}
; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
; CHECK: %cmp = icmp eq i32 addrspace(3)* %ptr0, null
-define void @lds_promoted_alloca_icmp_null_rhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_icmp_null_rhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%alloca = alloca [16 x i32], align 4
%ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
%cmp = icmp eq i32* %ptr0, null
@@ -35,7 +35,7 @@ define void @lds_promoted_alloca_icmp_null_rhs(i32 addrspace(1)* %out, i32 %a, i
; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_null_lhs.alloca, i32 0, i32 %{{[0-9]+}}
; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
; CHECK: %cmp = icmp eq i32 addrspace(3)* null, %ptr0
-define void @lds_promoted_alloca_icmp_null_lhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_icmp_null_lhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%alloca = alloca [16 x i32], align 4
%ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
%cmp = icmp eq i32* null, %ptr0
@@ -49,7 +49,7 @@ define void @lds_promoted_alloca_icmp_null_lhs(i32 addrspace(1)* %out, i32 %a, i
; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
; CHECK: %ptr1 = call i32* @get_unknown_pointer()
; CHECK: %cmp = icmp eq i32* %ptr0, %ptr1
-define void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%alloca = alloca [16 x i32], align 4
%ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
%ptr1 = call i32* @get_unknown_pointer()
diff --git a/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll b/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
index 0462a351c39b..d196897d67dc 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
@@ -13,7 +13,7 @@
; CHECK: endif:
; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
-define void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 {
entry:
%alloca = alloca [64 x i32], align 4
br i1 undef, label %if, label %else
@@ -34,7 +34,7 @@ endif:
; CHECK-LABEL: @branch_ptr_phi_alloca_null_0(
; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ null, %entry ]
-define void @branch_ptr_phi_alloca_null_0(i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @branch_ptr_phi_alloca_null_0(i32 %a, i32 %b) #0 {
entry:
%alloca = alloca [64 x i32], align 4
br i1 undef, label %if, label %endif
@@ -51,7 +51,7 @@ endif:
; CHECK-LABEL: @branch_ptr_phi_alloca_null_1(
; CHECK: %phi.ptr = phi i32 addrspace(3)* [ null, %entry ], [ %arrayidx0, %if ]
-define void @branch_ptr_phi_alloca_null_1(i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @branch_ptr_phi_alloca_null_1(i32 %a, i32 %b) #0 {
entry:
%alloca = alloca [64 x i32], align 4
br i1 undef, label %if, label %endif
@@ -73,7 +73,7 @@ endif:
; CHECK: br label %exit
; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %entry ]
; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
-define void @one_phi_value(i32 %a) #0 {
+define amdgpu_kernel void @one_phi_value(i32 %a) #0 {
entry:
%alloca = alloca [64 x i32], align 4
%arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
@@ -97,7 +97,7 @@ exit:
; CHECK: endif:
; CHECK: %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
; CHECK: store i32 0, i32* %phi.ptr, align 4
-define void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 {
entry:
%alloca = alloca [64 x i32], align 4
br i1 undef, label %if, label %else
@@ -134,7 +134,7 @@ endif:
; CHECK-LABEL: @ptr_induction_var_same_alloca(
; CHECK: %alloca = alloca [64 x i32], align 4
; CHECK: phi i32* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
-define void @ptr_induction_var_same_alloca() #0 {
+define amdgpu_kernel void @ptr_induction_var_same_alloca() #0 {
entry:
%alloca = alloca [64 x i32], align 4
%arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2
@@ -172,7 +172,7 @@ for.body: ; preds = %for.body, %entry
; CHECK: %alloca = alloca [64 x i32], align 4
; CHECK: %p.08 = phi i32* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
; CHECK: %cmp = icmp eq i32* %incdec.ptr, %call
-define void @ptr_induction_var_alloca_unknown() #0 {
+define amdgpu_kernel void @ptr_induction_var_alloca_unknown() #0 {
entry:
%alloca = alloca [64 x i32], align 4
%arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2
diff --git a/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll b/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
index 34d274df7387..55c2229fb6bd 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
@@ -3,7 +3,7 @@
; CHECK-LABEL: @lds_promoted_alloca_select_invalid_pointer_operand(
; CHECK: %alloca = alloca i32
; CHECK: select i1 undef, i32* undef, i32* %alloca
-define void @lds_promoted_alloca_select_invalid_pointer_operand() #0 {
+define amdgpu_kernel void @lds_promoted_alloca_select_invalid_pointer_operand() #0 {
%alloca = alloca i32, align 4
%select = select i1 undef, i32* undef, i32* %alloca
store i32 0, i32* %select, align 4
@@ -16,7 +16,7 @@ define void @lds_promoted_alloca_select_invalid_pointer_operand() #0 {
; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b
; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
-define void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 {
%alloca = alloca [16 x i32], align 4
%ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
%ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
@@ -33,7 +33,7 @@ define void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 {
; CHECK: %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a
; CHECK: %ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b
; CHECK: %select = select i1 undef, i32* %ptr0, i32* %ptr1
-define void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 {
%alloca0 = alloca i32, i32 16, align 4
%alloca1 = alloca i32, i32 16, align 4
%ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a
@@ -50,7 +50,7 @@ define void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 {
; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 3
; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
-define void @lds_promote_alloca_select_two_derived_constant_pointers() #0 {
+define amdgpu_kernel void @lds_promote_alloca_select_two_derived_constant_pointers() #0 {
%alloca = alloca [16 x i32], align 4
%ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 1
%ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 3
@@ -67,7 +67,7 @@ define void @lds_promote_alloca_select_two_derived_constant_pointers() #0 {
; CHECK: %select0 = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
; CHECK: %select1 = select i1 undef, i32 addrspace(3)* %select0, i32 addrspace(3)* %ptr2
; CHECK: store i32 0, i32 addrspace(3)* %select1, align 4
-define void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 {
%alloca = alloca [16 x i32], align 4
%ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
%ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
@@ -78,7 +78,7 @@ define void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0
ret void
}
-define void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 {
entry:
%alloca = alloca [16 x i32], align 4
%ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
@@ -102,7 +102,7 @@ bb2:
; CHECK-LABEL: @select_null_rhs(
; CHECK-NOT: alloca
; CHECK: select i1 %tmp2, double addrspace(3)* %{{[0-9]+}}, double addrspace(3)* null
-define void @select_null_rhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
+define amdgpu_kernel void @select_null_rhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
bb:
%tmp = alloca double, align 8
store double 0.000000e+00, double* %tmp, align 8
@@ -117,7 +117,7 @@ bb:
; CHECK-LABEL: @select_null_lhs(
; CHECK-NOT: alloca
; CHECK: select i1 %tmp2, double addrspace(3)* null, double addrspace(3)* %{{[0-9]+}}
-define void @select_null_lhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
+define amdgpu_kernel void @select_null_lhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
bb:
%tmp = alloca double, align 8
store double 0.000000e+00, double* %tmp, align 8
diff --git a/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll b/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll
index e331731f90f6..88c0e911662d 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll
@@ -8,7 +8,7 @@ declare void @llvm.stackrestore(i8*) #2
; CHECK-LABEL: @try_promote_unhandled_intrinsic(
; CHECK: alloca
; CHECK: call void @llvm.stackrestore(i8* %tmp1)
-define void @try_promote_unhandled_intrinsic(i32 addrspace(1)* %arg) #2 {
+define amdgpu_kernel void @try_promote_unhandled_intrinsic(i32 addrspace(1)* %arg) #2 {
bb:
%tmp = alloca i32, align 4
%tmp1 = bitcast i32* %tmp to i8*
diff --git a/test/CodeGen/AMDGPU/promote-alloca-volatile.ll b/test/CodeGen/AMDGPU/promote-alloca-volatile.ll
index f9de38839bc5..9c43a6dc60f4 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-volatile.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-volatile.ll
@@ -2,8 +2,8 @@
; CHECK-LABEL: @volatile_load(
; CHECK: alloca [5 x i32]
-; CHECK load volatile i32, i32*
-define void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+; CHECK: load volatile i32, i32*
+define amdgpu_kernel void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
entry:
%stack = alloca [5 x i32], align 4
%tmp = load i32, i32 addrspace(1)* %in, align 4
@@ -15,8 +15,8 @@ entry:
; CHECK-LABEL: @volatile_store(
; CHECK: alloca [5 x i32]
-; CHECK store volatile i32 %tmp, i32*
-define void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+; CHECK: store volatile i32 %tmp, i32*
+define amdgpu_kernel void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
entry:
%stack = alloca [5 x i32], align 4
%tmp = load i32, i32 addrspace(1)* %in, align 4
@@ -30,7 +30,7 @@ entry:
; CHECK: alloca double
; CHECK: load double
; CHECK: load volatile double
-define void @volatile_and_non_volatile_load(double addrspace(1)* nocapture %arg, i32 %arg1) #0 {
+define amdgpu_kernel void @volatile_and_non_volatile_load(double addrspace(1)* nocapture %arg, i32 %arg1) #0 {
bb:
%tmp = alloca double, align 8
store double 0.000000e+00, double* %tmp, align 8
diff --git a/test/CodeGen/AMDGPU/pv.ll b/test/CodeGen/AMDGPU/pv.ll
index d5f9833d6ad0..1474dbabba69 100644
--- a/test/CodeGen/AMDGPU/pv.ll
+++ b/test/CodeGen/AMDGPU/pv.ll
@@ -1,240 +1,236 @@
-; RUN: llc < %s -march=r600 | FileCheck %s
+; RUN: llc -march=r600 < %s | FileCheck %s
; CHECK: DOT4 * T{{[0-9]\.W}} (MASKED)
; CHECK: MAX T{{[0-9].[XYZW]}}, 0.0, PV.X
-
define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) {
main_body:
- %0 = extractelement <4 x float> %reg1, i32 0
- %1 = extractelement <4 x float> %reg1, i32 1
- %2 = extractelement <4 x float> %reg1, i32 2
- %3 = extractelement <4 x float> %reg1, i32 3
- %4 = extractelement <4 x float> %reg2, i32 0
- %5 = extractelement <4 x float> %reg2, i32 1
- %6 = extractelement <4 x float> %reg2, i32 2
- %7 = extractelement <4 x float> %reg2, i32 3
- %8 = extractelement <4 x float> %reg3, i32 0
- %9 = extractelement <4 x float> %reg3, i32 1
- %10 = extractelement <4 x float> %reg3, i32 2
- %11 = extractelement <4 x float> %reg3, i32 3
- %12 = extractelement <4 x float> %reg4, i32 0
- %13 = extractelement <4 x float> %reg4, i32 1
- %14 = extractelement <4 x float> %reg4, i32 2
- %15 = extractelement <4 x float> %reg4, i32 3
- %16 = extractelement <4 x float> %reg5, i32 0
- %17 = extractelement <4 x float> %reg5, i32 1
- %18 = extractelement <4 x float> %reg5, i32 2
- %19 = extractelement <4 x float> %reg5, i32 3
- %20 = extractelement <4 x float> %reg6, i32 0
- %21 = extractelement <4 x float> %reg6, i32 1
- %22 = extractelement <4 x float> %reg6, i32 2
- %23 = extractelement <4 x float> %reg6, i32 3
- %24 = extractelement <4 x float> %reg7, i32 0
- %25 = extractelement <4 x float> %reg7, i32 1
- %26 = extractelement <4 x float> %reg7, i32 2
- %27 = extractelement <4 x float> %reg7, i32 3
- %28 = load <4 x float>, <4 x float> addrspace(8)* null
- %29 = extractelement <4 x float> %28, i32 0
- %30 = fmul float %0, %29
- %31 = load <4 x float>, <4 x float> addrspace(8)* null
- %32 = extractelement <4 x float> %31, i32 1
- %33 = fmul float %0, %32
- %34 = load <4 x float>, <4 x float> addrspace(8)* null
- %35 = extractelement <4 x float> %34, i32 2
- %36 = fmul float %0, %35
- %37 = load <4 x float>, <4 x float> addrspace(8)* null
- %38 = extractelement <4 x float> %37, i32 3
- %39 = fmul float %0, %38
- %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
- %41 = extractelement <4 x float> %40, i32 0
- %42 = fmul float %1, %41
- %43 = fadd float %42, %30
- %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
- %45 = extractelement <4 x float> %44, i32 1
- %46 = fmul float %1, %45
- %47 = fadd float %46, %33
- %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
- %49 = extractelement <4 x float> %48, i32 2
- %50 = fmul float %1, %49
- %51 = fadd float %50, %36
- %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
- %53 = extractelement <4 x float> %52, i32 3
- %54 = fmul float %1, %53
- %55 = fadd float %54, %39
- %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
- %57 = extractelement <4 x float> %56, i32 0
- %58 = fmul float %2, %57
- %59 = fadd float %58, %43
- %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
- %61 = extractelement <4 x float> %60, i32 1
- %62 = fmul float %2, %61
- %63 = fadd float %62, %47
- %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
- %65 = extractelement <4 x float> %64, i32 2
- %66 = fmul float %2, %65
- %67 = fadd float %66, %51
- %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
- %69 = extractelement <4 x float> %68, i32 3
- %70 = fmul float %2, %69
- %71 = fadd float %70, %55
- %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
- %73 = extractelement <4 x float> %72, i32 0
- %74 = fmul float %3, %73
- %75 = fadd float %74, %59
- %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
- %77 = extractelement <4 x float> %76, i32 1
- %78 = fmul float %3, %77
- %79 = fadd float %78, %63
- %80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
- %81 = extractelement <4 x float> %80, i32 2
- %82 = fmul float %3, %81
- %83 = fadd float %82, %67
- %84 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
- %85 = extractelement <4 x float> %84, i32 3
- %86 = fmul float %3, %85
- %87 = fadd float %86, %71
- %88 = insertelement <4 x float> undef, float %4, i32 0
- %89 = insertelement <4 x float> %88, float %5, i32 1
- %90 = insertelement <4 x float> %89, float %6, i32 2
- %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 3
- %92 = insertelement <4 x float> undef, float %4, i32 0
- %93 = insertelement <4 x float> %92, float %5, i32 1
- %94 = insertelement <4 x float> %93, float %6, i32 2
- %95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3
- %96 = call float @llvm.r600.dot4(<4 x float> %91, <4 x float> %95)
- %97 = call float @llvm.fabs.f32(float %96)
- %98 = call float @llvm.r600.recipsqrt.clamped.f32(float %97)
- %99 = fmul float %4, %98
- %100 = fmul float %5, %98
- %101 = fmul float %6, %98
- %102 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
- %103 = extractelement <4 x float> %102, i32 0
- %104 = fmul float %103, %8
- %105 = fadd float %104, %20
- %106 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
- %107 = extractelement <4 x float> %106, i32 1
- %108 = fmul float %107, %9
- %109 = fadd float %108, %21
- %110 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
- %111 = extractelement <4 x float> %110, i32 2
- %112 = fmul float %111, %10
- %113 = fadd float %112, %22
- %114 = call float @llvm.AMDGPU.clamp.f32(float %105, float 0.000000e+00, float 1.000000e+00)
- %115 = call float @llvm.AMDGPU.clamp.f32(float %109, float 0.000000e+00, float 1.000000e+00)
- %116 = call float @llvm.AMDGPU.clamp.f32(float %113, float 0.000000e+00, float 1.000000e+00)
- %117 = call float @llvm.AMDGPU.clamp.f32(float %15, float 0.000000e+00, float 1.000000e+00)
- %118 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
- %119 = extractelement <4 x float> %118, i32 0
- %120 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
- %121 = extractelement <4 x float> %120, i32 1
- %122 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
- %123 = extractelement <4 x float> %122, i32 2
- %124 = insertelement <4 x float> undef, float %99, i32 0
- %125 = insertelement <4 x float> %124, float %100, i32 1
- %126 = insertelement <4 x float> %125, float %101, i32 2
- %127 = insertelement <4 x float> %126, float 0.000000e+00, i32 3
- %128 = insertelement <4 x float> undef, float %119, i32 0
- %129 = insertelement <4 x float> %128, float %121, i32 1
- %130 = insertelement <4 x float> %129, float %123, i32 2
- %131 = insertelement <4 x float> %130, float 0.000000e+00, i32 3
- %132 = call float @llvm.r600.dot4(<4 x float> %127, <4 x float> %131)
- %133 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
- %134 = extractelement <4 x float> %133, i32 0
- %135 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
- %136 = extractelement <4 x float> %135, i32 1
- %137 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
- %138 = extractelement <4 x float> %137, i32 2
- %139 = insertelement <4 x float> undef, float %99, i32 0
- %140 = insertelement <4 x float> %139, float %100, i32 1
- %141 = insertelement <4 x float> %140, float %101, i32 2
- %142 = insertelement <4 x float> %141, float 0.000000e+00, i32 3
- %143 = insertelement <4 x float> undef, float %134, i32 0
- %144 = insertelement <4 x float> %143, float %136, i32 1
- %145 = insertelement <4 x float> %144, float %138, i32 2
- %146 = insertelement <4 x float> %145, float 0.000000e+00, i32 3
- %147 = call float @llvm.r600.dot4(<4 x float> %142, <4 x float> %146)
- %148 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
- %149 = extractelement <4 x float> %148, i32 0
- %150 = fmul float %149, %8
- %151 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
- %152 = extractelement <4 x float> %151, i32 1
- %153 = fmul float %152, %9
- %154 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
- %155 = extractelement <4 x float> %154, i32 2
- %156 = fmul float %155, %10
- %157 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
- %158 = extractelement <4 x float> %157, i32 0
- %159 = fmul float %158, %12
- %160 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
- %161 = extractelement <4 x float> %160, i32 1
- %162 = fmul float %161, %13
- %163 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
- %164 = extractelement <4 x float> %163, i32 2
- %165 = fmul float %164, %14
- %166 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
- %167 = extractelement <4 x float> %166, i32 0
- %168 = fmul float %167, %16
- %169 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
- %170 = extractelement <4 x float> %169, i32 1
- %171 = fmul float %170, %17
- %172 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
- %173 = extractelement <4 x float> %172, i32 2
- %174 = fmul float %173, %18
- %175 = fcmp uge float %132, 0.000000e+00
- %176 = select i1 %175, float %132, float 0.000000e+00
- %177 = fcmp uge float %147, 0.000000e+00
- %178 = select i1 %177, float %147, float 0.000000e+00
- %179 = call float @llvm.pow.f32(float %178, float %24)
- %180 = fcmp ult float %132, 0.000000e+00
- %181 = select i1 %180, float 0.000000e+00, float %179
- %182 = fadd float %150, %105
- %183 = fadd float %153, %109
- %184 = fadd float %156, %113
- %185 = fmul float %176, %159
- %186 = fadd float %185, %182
- %187 = fmul float %176, %162
- %188 = fadd float %187, %183
- %189 = fmul float %176, %165
- %190 = fadd float %189, %184
- %191 = fmul float %181, %168
- %192 = fadd float %191, %186
- %193 = fmul float %181, %171
- %194 = fadd float %193, %188
- %195 = fmul float %181, %174
- %196 = fadd float %195, %190
- %197 = call float @llvm.AMDGPU.clamp.f32(float %192, float 0.000000e+00, float 1.000000e+00)
- %198 = call float @llvm.AMDGPU.clamp.f32(float %194, float 0.000000e+00, float 1.000000e+00)
- %199 = call float @llvm.AMDGPU.clamp.f32(float %196, float 0.000000e+00, float 1.000000e+00)
- %200 = insertelement <4 x float> undef, float %75, i32 0
- %201 = insertelement <4 x float> %200, float %79, i32 1
- %202 = insertelement <4 x float> %201, float %83, i32 2
- %203 = insertelement <4 x float> %202, float %87, i32 3
- call void @llvm.r600.store.swizzle(<4 x float> %203, i32 60, i32 1)
- %204 = insertelement <4 x float> undef, float %197, i32 0
- %205 = insertelement <4 x float> %204, float %198, i32 1
- %206 = insertelement <4 x float> %205, float %199, i32 2
- %207 = insertelement <4 x float> %206, float %117, i32 3
- call void @llvm.r600.store.swizzle(<4 x float> %207, i32 0, i32 2)
+ %tmp = extractelement <4 x float> %reg1, i32 0
+ %tmp13 = extractelement <4 x float> %reg1, i32 1
+ %tmp14 = extractelement <4 x float> %reg1, i32 2
+ %tmp15 = extractelement <4 x float> %reg1, i32 3
+ %tmp16 = extractelement <4 x float> %reg2, i32 0
+ %tmp17 = extractelement <4 x float> %reg2, i32 1
+ %tmp18 = extractelement <4 x float> %reg2, i32 2
+ %tmp19 = extractelement <4 x float> %reg2, i32 3
+ %tmp20 = extractelement <4 x float> %reg3, i32 0
+ %tmp21 = extractelement <4 x float> %reg3, i32 1
+ %tmp22 = extractelement <4 x float> %reg3, i32 2
+ %tmp23 = extractelement <4 x float> %reg3, i32 3
+ %tmp24 = extractelement <4 x float> %reg4, i32 0
+ %tmp25 = extractelement <4 x float> %reg4, i32 1
+ %tmp26 = extractelement <4 x float> %reg4, i32 2
+ %tmp27 = extractelement <4 x float> %reg4, i32 3
+ %tmp28 = extractelement <4 x float> %reg5, i32 0
+ %tmp29 = extractelement <4 x float> %reg5, i32 1
+ %tmp30 = extractelement <4 x float> %reg5, i32 2
+ %tmp31 = extractelement <4 x float> %reg5, i32 3
+ %tmp32 = extractelement <4 x float> %reg6, i32 0
+ %tmp33 = extractelement <4 x float> %reg6, i32 1
+ %tmp34 = extractelement <4 x float> %reg6, i32 2
+ %tmp35 = extractelement <4 x float> %reg6, i32 3
+ %tmp36 = extractelement <4 x float> %reg7, i32 0
+ %tmp37 = extractelement <4 x float> %reg7, i32 1
+ %tmp38 = extractelement <4 x float> %reg7, i32 2
+ %tmp39 = extractelement <4 x float> %reg7, i32 3
+ %tmp40 = load <4 x float>, <4 x float> addrspace(8)* null
+ %tmp41 = extractelement <4 x float> %tmp40, i32 0
+ %tmp42 = fmul float %tmp, %tmp41
+ %tmp43 = load <4 x float>, <4 x float> addrspace(8)* null
+ %tmp44 = extractelement <4 x float> %tmp43, i32 1
+ %tmp45 = fmul float %tmp, %tmp44
+ %tmp46 = load <4 x float>, <4 x float> addrspace(8)* null
+ %tmp47 = extractelement <4 x float> %tmp46, i32 2
+ %tmp48 = fmul float %tmp, %tmp47
+ %tmp49 = load <4 x float>, <4 x float> addrspace(8)* null
+ %tmp50 = extractelement <4 x float> %tmp49, i32 3
+ %tmp51 = fmul float %tmp, %tmp50
+ %tmp52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %tmp53 = extractelement <4 x float> %tmp52, i32 0
+ %tmp54 = fmul float %tmp13, %tmp53
+ %tmp55 = fadd float %tmp54, %tmp42
+ %tmp56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %tmp57 = extractelement <4 x float> %tmp56, i32 1
+ %tmp58 = fmul float %tmp13, %tmp57
+ %tmp59 = fadd float %tmp58, %tmp45
+ %tmp60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %tmp61 = extractelement <4 x float> %tmp60, i32 2
+ %tmp62 = fmul float %tmp13, %tmp61
+ %tmp63 = fadd float %tmp62, %tmp48
+ %tmp64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %tmp65 = extractelement <4 x float> %tmp64, i32 3
+ %tmp66 = fmul float %tmp13, %tmp65
+ %tmp67 = fadd float %tmp66, %tmp51
+ %tmp68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %tmp69 = extractelement <4 x float> %tmp68, i32 0
+ %tmp70 = fmul float %tmp14, %tmp69
+ %tmp71 = fadd float %tmp70, %tmp55
+ %tmp72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %tmp73 = extractelement <4 x float> %tmp72, i32 1
+ %tmp74 = fmul float %tmp14, %tmp73
+ %tmp75 = fadd float %tmp74, %tmp59
+ %tmp76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %tmp77 = extractelement <4 x float> %tmp76, i32 2
+ %tmp78 = fmul float %tmp14, %tmp77
+ %tmp79 = fadd float %tmp78, %tmp63
+ %tmp80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %tmp81 = extractelement <4 x float> %tmp80, i32 3
+ %tmp82 = fmul float %tmp14, %tmp81
+ %tmp83 = fadd float %tmp82, %tmp67
+ %tmp84 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %tmp85 = extractelement <4 x float> %tmp84, i32 0
+ %tmp86 = fmul float %tmp15, %tmp85
+ %tmp87 = fadd float %tmp86, %tmp71
+ %tmp88 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %tmp89 = extractelement <4 x float> %tmp88, i32 1
+ %tmp90 = fmul float %tmp15, %tmp89
+ %tmp91 = fadd float %tmp90, %tmp75
+ %tmp92 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %tmp93 = extractelement <4 x float> %tmp92, i32 2
+ %tmp94 = fmul float %tmp15, %tmp93
+ %tmp95 = fadd float %tmp94, %tmp79
+ %tmp96 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %tmp97 = extractelement <4 x float> %tmp96, i32 3
+ %tmp98 = fmul float %tmp15, %tmp97
+ %tmp99 = fadd float %tmp98, %tmp83
+ %tmp100 = insertelement <4 x float> undef, float %tmp16, i32 0
+ %tmp101 = insertelement <4 x float> %tmp100, float %tmp17, i32 1
+ %tmp102 = insertelement <4 x float> %tmp101, float %tmp18, i32 2
+ %tmp103 = insertelement <4 x float> %tmp102, float 0.000000e+00, i32 3
+ %tmp104 = insertelement <4 x float> undef, float %tmp16, i32 0
+ %tmp105 = insertelement <4 x float> %tmp104, float %tmp17, i32 1
+ %tmp106 = insertelement <4 x float> %tmp105, float %tmp18, i32 2
+ %tmp107 = insertelement <4 x float> %tmp106, float 0.000000e+00, i32 3
+ %tmp108 = call float @llvm.r600.dot4(<4 x float> %tmp103, <4 x float> %tmp107)
+ %tmp109 = call float @llvm.fabs.f32(float %tmp108)
+ %tmp110 = call float @llvm.r600.recipsqrt.clamped.f32(float %tmp109)
+ %tmp111 = fmul float %tmp16, %tmp110
+ %tmp112 = fmul float %tmp17, %tmp110
+ %tmp113 = fmul float %tmp18, %tmp110
+ %tmp114 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+ %tmp115 = extractelement <4 x float> %tmp114, i32 0
+ %tmp116 = fmul float %tmp115, %tmp20
+ %tmp117 = fadd float %tmp116, %tmp32
+ %tmp118 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+ %tmp119 = extractelement <4 x float> %tmp118, i32 1
+ %tmp120 = fmul float %tmp119, %tmp21
+ %tmp121 = fadd float %tmp120, %tmp33
+ %tmp122 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+ %tmp123 = extractelement <4 x float> %tmp122, i32 2
+ %tmp124 = fmul float %tmp123, %tmp22
+ %tmp125 = fadd float %tmp124, %tmp34
+ %max.0.i = call float @llvm.maxnum.f32(float %tmp117, float 0.000000e+00)
+ %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00)
+ %max.0.i11 = call float @llvm.maxnum.f32(float %tmp121, float 0.000000e+00)
+ %clamp.i12 = call float @llvm.minnum.f32(float %max.0.i11, float 1.000000e+00)
+ %max.0.i9 = call float @llvm.maxnum.f32(float %tmp125, float 0.000000e+00)
+ %clamp.i10 = call float @llvm.minnum.f32(float %max.0.i9, float 1.000000e+00)
+ %max.0.i7 = call float @llvm.maxnum.f32(float %tmp27, float 0.000000e+00)
+ %clamp.i8 = call float @llvm.minnum.f32(float %max.0.i7, float 1.000000e+00)
+ %tmp126 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %tmp127 = extractelement <4 x float> %tmp126, i32 0
+ %tmp128 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %tmp129 = extractelement <4 x float> %tmp128, i32 1
+ %tmp130 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %tmp131 = extractelement <4 x float> %tmp130, i32 2
+ %tmp132 = insertelement <4 x float> undef, float %tmp111, i32 0
+ %tmp133 = insertelement <4 x float> %tmp132, float %tmp112, i32 1
+ %tmp134 = insertelement <4 x float> %tmp133, float %tmp113, i32 2
+ %tmp135 = insertelement <4 x float> %tmp134, float 0.000000e+00, i32 3
+ %tmp136 = insertelement <4 x float> undef, float %tmp127, i32 0
+ %tmp137 = insertelement <4 x float> %tmp136, float %tmp129, i32 1
+ %tmp138 = insertelement <4 x float> %tmp137, float %tmp131, i32 2
+ %tmp139 = insertelement <4 x float> %tmp138, float 0.000000e+00, i32 3
+ %tmp140 = call float @llvm.r600.dot4(<4 x float> %tmp135, <4 x float> %tmp139)
+ %tmp141 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %tmp142 = extractelement <4 x float> %tmp141, i32 0
+ %tmp143 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %tmp144 = extractelement <4 x float> %tmp143, i32 1
+ %tmp145 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %tmp146 = extractelement <4 x float> %tmp145, i32 2
+ %tmp147 = insertelement <4 x float> undef, float %tmp111, i32 0
+ %tmp148 = insertelement <4 x float> %tmp147, float %tmp112, i32 1
+ %tmp149 = insertelement <4 x float> %tmp148, float %tmp113, i32 2
+ %tmp150 = insertelement <4 x float> %tmp149, float 0.000000e+00, i32 3
+ %tmp151 = insertelement <4 x float> undef, float %tmp142, i32 0
+ %tmp152 = insertelement <4 x float> %tmp151, float %tmp144, i32 1
+ %tmp153 = insertelement <4 x float> %tmp152, float %tmp146, i32 2
+ %tmp154 = insertelement <4 x float> %tmp153, float 0.000000e+00, i32 3
+ %tmp155 = call float @llvm.r600.dot4(<4 x float> %tmp150, <4 x float> %tmp154)
+ %tmp156 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+ %tmp157 = extractelement <4 x float> %tmp156, i32 0
+ %tmp158 = fmul float %tmp157, %tmp20
+ %tmp159 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+ %tmp160 = extractelement <4 x float> %tmp159, i32 1
+ %tmp161 = fmul float %tmp160, %tmp21
+ %tmp162 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+ %tmp163 = extractelement <4 x float> %tmp162, i32 2
+ %tmp164 = fmul float %tmp163, %tmp22
+ %tmp165 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %tmp166 = extractelement <4 x float> %tmp165, i32 0
+ %tmp167 = fmul float %tmp166, %tmp24
+ %tmp168 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %tmp169 = extractelement <4 x float> %tmp168, i32 1
+ %tmp170 = fmul float %tmp169, %tmp25
+ %tmp171 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %tmp172 = extractelement <4 x float> %tmp171, i32 2
+ %tmp173 = fmul float %tmp172, %tmp26
+ %tmp174 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+ %tmp175 = extractelement <4 x float> %tmp174, i32 0
+ %tmp176 = fmul float %tmp175, %tmp28
+ %tmp177 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+ %tmp178 = extractelement <4 x float> %tmp177, i32 1
+ %tmp179 = fmul float %tmp178, %tmp29
+ %tmp180 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+ %tmp181 = extractelement <4 x float> %tmp180, i32 2
+ %tmp182 = fmul float %tmp181, %tmp30
+ %tmp183 = fcmp uge float %tmp140, 0.000000e+00
+ %tmp184 = select i1 %tmp183, float %tmp140, float 0.000000e+00
+ %tmp185 = fcmp uge float %tmp155, 0.000000e+00
+ %tmp186 = select i1 %tmp185, float %tmp155, float 0.000000e+00
+ %tmp187 = call float @llvm.pow.f32(float %tmp186, float %tmp36)
+ %tmp188 = fcmp ult float %tmp140, 0.000000e+00
+ %tmp189 = select i1 %tmp188, float 0.000000e+00, float %tmp187
+ %tmp190 = fadd float %tmp158, %tmp117
+ %tmp191 = fadd float %tmp161, %tmp121
+ %tmp192 = fadd float %tmp164, %tmp125
+ %tmp193 = fmul float %tmp184, %tmp167
+ %tmp194 = fadd float %tmp193, %tmp190
+ %tmp195 = fmul float %tmp184, %tmp170
+ %tmp196 = fadd float %tmp195, %tmp191
+ %tmp197 = fmul float %tmp184, %tmp173
+ %tmp198 = fadd float %tmp197, %tmp192
+ %tmp199 = fmul float %tmp189, %tmp176
+ %tmp200 = fadd float %tmp199, %tmp194
+ %tmp201 = fmul float %tmp189, %tmp179
+ %tmp202 = fadd float %tmp201, %tmp196
+ %tmp203 = fmul float %tmp189, %tmp182
+ %tmp204 = fadd float %tmp203, %tmp198
+ %max.0.i5 = call float @llvm.maxnum.f32(float %tmp200, float 0.000000e+00)
+ %clamp.i6 = call float @llvm.minnum.f32(float %max.0.i5, float 1.000000e+00)
+ %max.0.i3 = call float @llvm.maxnum.f32(float %tmp202, float 0.000000e+00)
+ %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00)
+ %max.0.i1 = call float @llvm.maxnum.f32(float %tmp204, float 0.000000e+00)
+ %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00)
+ %tmp205 = insertelement <4 x float> undef, float %tmp87, i32 0
+ %tmp206 = insertelement <4 x float> %tmp205, float %tmp91, i32 1
+ %tmp207 = insertelement <4 x float> %tmp206, float %tmp95, i32 2
+ %tmp208 = insertelement <4 x float> %tmp207, float %tmp99, i32 3
+ call void @llvm.r600.store.swizzle(<4 x float> %tmp208, i32 60, i32 1)
+ %tmp209 = insertelement <4 x float> undef, float %clamp.i6, i32 0
+ %tmp210 = insertelement <4 x float> %tmp209, float %clamp.i4, i32 1
+ %tmp211 = insertelement <4 x float> %tmp210, float %clamp.i2, i32 2
+ %tmp212 = insertelement <4 x float> %tmp211, float %clamp.i8, i32 3
+ call void @llvm.r600.store.swizzle(<4 x float> %tmp212, i32 0, i32 2)
ret void
}
-; Function Attrs: readnone
-declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
-
-; Function Attrs: readonly
-declare float @llvm.fabs.f32(float) #1
-
-; Function Attrs: readnone
-declare float @llvm.r600.recipsqrt.clamped.f32(float) #1
-
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.clamp.f32(float, float, float) #1
-
-; Function Attrs: nounwind readonly
-declare float @llvm.pow.f32(float, float) #2
-
-declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) #3
+declare float @llvm.minnum.f32(float, float) #0
+declare float @llvm.maxnum.f32(float, float) #0
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #0
+declare float @llvm.fabs.f32(float) #0
+declare float @llvm.r600.recipsqrt.clamped.f32(float) #0
+declare float @llvm.pow.f32(float, float) #0
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) #1
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readonly }
-attributes #3 = { nounwind }
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
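
The replacement body above lowers the old @llvm.AMDGPU.clamp.f32 calls to the generic maxnum/minnum intrinsics. A minimal sketch of the equivalence for one value (illustrative %x, not part of the patch):

  ; clamp(%x, 0.0, 1.0) via the generic intrinsics
  %max = call float @llvm.maxnum.f32(float %x, float 0.000000e+00)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.000000e+00)
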
diff --git a/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll b/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll
index 461caf5b5d20..e2143ff85b72 100644
--- a/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll
+++ b/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll
@@ -10,7 +10,7 @@ main_body:
%tmp6 = insertelement <4 x float> %tmp5, float %tmp2, i32 1
%tmp7 = insertelement <4 x float> %tmp6, float %tmp3, i32 2
%tmp8 = insertelement <4 x float> %tmp7, float %tmp4, i32 3
- %tmp9 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %tmp8)
+ %tmp9 = call <4 x float> @llvm.r600.cube(<4 x float> %tmp8)
%tmp10 = extractelement <4 x float> %tmp9, i32 0
%tmp11 = extractelement <4 x float> %tmp9, i32 1
%tmp12 = extractelement <4 x float> %tmp9, i32 2
@@ -45,7 +45,7 @@ main_body:
}
; Function Attrs: readnone
-declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #0
+declare <4 x float> @llvm.r600.cube(<4 x float>) #0
; Function Attrs: readnone
declare float @fabs(float) #0
diff --git a/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll b/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll
index 866a4a9191e2..b7ed34bbf09b 100644
--- a/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll
+++ b/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll
@@ -2,7 +2,7 @@
; Don't crash
; CHECK: MAX_UINT
-define void @test(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i64 addrspace(1)* %out) {
bb:
store i64 2, i64 addrspace(1)* %out
%tmp = load i64, i64 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/r600.alu-limits.ll b/test/CodeGen/AMDGPU/r600.alu-limits.ll
new file mode 100644
index 000000000000..2604ed4e574c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/r600.alu-limits.ll
@@ -0,0 +1,29 @@
+; RUN: opt -loop-unroll -unroll-threshold=2000 -S < %s | llc -march=r600 -mcpu=cypress | FileCheck %s
+; REQUIRES: asserts
+
+; CHECK: {{^}}alu_limits:
+; CHECK: CF_END
+
+%struct.foo = type {i32, i32, i32}
+
+define amdgpu_kernel void @alu_limits(i32 addrspace(1)* %out, %struct.foo* %in, i32 %offset) {
+entry:
+ %ptr = getelementptr inbounds %struct.foo, %struct.foo* %in, i32 1, i32 2
+ %x = load i32, i32* %ptr, align 4
+ br label %loop
+loop:
+ %i = phi i32 [ 100, %entry ], [ %nexti, %loop ]
+ %val = phi i32 [ 1, %entry ], [ %nextval, %loop ]
+
+ %nexti = sub i32 %i, 1
+
+ %y = xor i32 %x, %i
+ %nextval = mul i32 %val, %y
+
+ %cond = icmp ne i32 %nexti, 0
+ br i1 %cond, label %loop, label %end
+end:
+ %out_val = add i32 %nextval, 4
+ store i32 %out_val, i32 addrspace(1)* %out, align 4
+ ret void
+}
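
The kernel above runs 100 iterations of an xor/mul chain; with -unroll-threshold=2000 the unroller fully replicates the body before llc runs, which should yield several hundred ALU instructions and exercise the R600 per-clause limits (the CF_END check only confirms compilation completes). A sketch of the unrolled shape, names hypothetical:

  %y.0 = xor i32 %x, 100
  %val.0 = mul i32 1, %y.0
  %y.1 = xor i32 %x, 99
  %val.1 = mul i32 %val.0, %y.1
  ; ... 98 more xor/mul pairs, down to %i = 1
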
diff --git a/test/CodeGen/AMDGPU/r600.amdgpu-alias-analysis.ll b/test/CodeGen/AMDGPU/r600.amdgpu-alias-analysis.ll
new file mode 100644
index 000000000000..8956d113e8b5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/r600.amdgpu-alias-analysis.ll
@@ -0,0 +1,7 @@
+; RUN: opt -mtriple=r600-- -O3 -aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s
+
+; CHECK: NoAlias: i8 addrspace(7)* %p1, i8* %p
+
+define amdgpu_kernel void @test(i8* %p, i8 addrspace(7)* %p1) {
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/r600.bitcast.ll b/test/CodeGen/AMDGPU/r600.bitcast.ll
index 49441ee8d186..acf7a66a357f 100644
--- a/test/CodeGen/AMDGPU/r600.bitcast.ll
+++ b/test/CodeGen/AMDGPU/r600.bitcast.ll
@@ -8,7 +8,7 @@
; EG: VTX_READ_128 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]]
; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z
; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)*
%1 = load <16 x i8>, <16 x i8> addrspace(1)* %0
@@ -21,7 +21,7 @@ entry:
; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]]
; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z
; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
%load = load float, float addrspace(1)* %in, align 4
%bc = bitcast float %load to <2 x i16>
store <2 x i16> %bc, <2 x i16> addrspace(1)* %out, align 4
@@ -33,7 +33,7 @@ define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in)
; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]]
; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z
; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
%load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
%bc = bitcast <2 x i16> %load to float
store float %bc, float addrspace(1)* %out, align 4
@@ -45,7 +45,7 @@ define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in)
; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]]
; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z
; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
%bc = bitcast <4 x i8> %load to i32
store i32 %bc, i32 addrspace(1)* %out, align 4
@@ -57,7 +57,7 @@ define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nou
; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]]
; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z
; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
%load = load i32, i32 addrspace(1)* %in, align 4
%bc = bitcast i32 %load to <4 x i8>
store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4
@@ -69,7 +69,7 @@ define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nou
; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]]
; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z
; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @v2i16_to_v4i8(<4 x i8> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v2i16_to_v4i8(<4 x i8> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
%load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
%bc = bitcast <2 x i16> %load to <4 x i8>
store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4
@@ -85,7 +85,7 @@ define void @v2i16_to_v4i8(<4 x i8> addrspace(1)* %out, <2 x i16> addrspace(1)*
; EG: VTX_READ_16
; EG-DAG: BFE_UINT
; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @v4i16_extract_i8(i8 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v4i16_extract_i8(i8 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind {
%load = load <4 x i16>, <4 x i16> addrspace(1)* %in, align 2
%bc = bitcast <4 x i16> %load to <8 x i8>
%element = extractelement <8 x i8> %bc, i32 5
@@ -98,7 +98,7 @@ define void @v4i16_extract_i8(i8 addrspace(1)* %out, <4 x i16> addrspace(1)* %in
; EG: VTX_READ_64 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]]
; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z
; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
%bc = bitcast <2 x i32> %val to double
store double %bc, double addrspace(1)* %out, align 8
diff --git a/test/CodeGen/AMDGPU/r600.global_atomics.ll b/test/CodeGen/AMDGPU/r600.global_atomics.ll
new file mode 100644
index 000000000000..1ddc41feb006
--- /dev/null
+++ b/test/CodeGen/AMDGPU/r600.global_atomics.ll
@@ -0,0 +1,542 @@
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; TODO: Add _RTN versions and merge with the GCN test
+
+; FUNC-LABEL: {{^}}atomic_add_i32_offset:
+; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_soffset:
+; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 9000
+ %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_huge_offset:
+; FIXME: looks like the offset is wrong
+; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 47224239175595
+
+ %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset:
+; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32:
+; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_addr64:
+; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_offset:
+; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset:
+; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32:
+; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_addr64:
+; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_offset:
+; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset:
+; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32:
+; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_addr64:
+; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_offset:
+; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset:
+; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32:
+; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_addr64:
+; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_offset:
+; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset:
+; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32:
+; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_addr64:
+; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_offset:
+; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset:
+; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32:
+; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_addr64:
+; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_offset:
+; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset:
+; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32:
+; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_addr64:
+; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_offset:
+; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset:
+; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32:
+; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_addr64:
+; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_offset:
+; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64_offset:
+; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32:
+; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64:
+; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_offset:
+; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset:
+; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32:
+; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+entry:
+ %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64:
+; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_offset:
+; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset:
+; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32:
+; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_addr64:
+; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
+define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_store_i32_offset:
+; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y
+define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ store atomic i32 %in, i32 addrspace(1)* %gep seq_cst, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_store_i32:
+; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y
+define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) {
+entry:
+ store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_store_i32_addr64_offset:
+; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y
+define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ store atomic i32 %in, i32 addrspace(1)* %gep seq_cst, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_store_i32_addr64:
+; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
+; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y
+define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ store atomic i32 %in, i32 addrspace(1)* %ptr seq_cst, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_inc_add:
+; EG: MEM_RAT ATOMIC_INC_UINT
+define amdgpu_kernel void @atomic_inc_add(i32 addrspace(1)* %out) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 1 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_dec_add:
+; EG: MEM_RAT ATOMIC_DEC_UINT
+define amdgpu_kernel void @atomic_dec_add(i32 addrspace(1)* %out) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 -1 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_inc_sub:
+; EG: MEM_RAT ATOMIC_INC_UINT
+define amdgpu_kernel void @atomic_inc_sub(i32 addrspace(1)* %out) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 -1 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_dec_sub:
+; EG: MEM_RAT ATOMIC_DEC_UINT
+define amdgpu_kernel void @atomic_dec_sub(i32 addrspace(1)* %out) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 1 seq_cst
+ ret void
+}
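
The TODO at the top of this file asks for _RTN (value-returning) variants; a minimal sketch of what one might look like in the same style (hypothetical, not part of the patch):

  define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
  entry:
    %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
    %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
    store i32 %val, i32 addrspace(1)* %out2
    ret void
  }
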
diff --git a/test/CodeGen/AMDGPU/r600.private-memory.ll b/test/CodeGen/AMDGPU/r600.private-memory.ll
index f406c160cbbe..53ee214f07ec 100644
--- a/test/CodeGen/AMDGPU/r600.private-memory.ll
+++ b/test/CodeGen/AMDGPU/r600.private-memory.ll
@@ -10,7 +10,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; Additional check in case the move ends up in the last slot
; R600-NOT: MOV * TO.X
-define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
entry:
%0 = alloca [2 x i32]
%1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0
diff --git a/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll b/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll
index a34a48e3b7ba..9eee9a6effc9 100644
--- a/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll
+++ b/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll
@@ -2,7 +2,7 @@
; FUNC-LABEL: {{^}}tgid_x:
; EG: MEM_RAT_CACHELESS STORE_RAW T1.X
-define void @tgid_x(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tgid_x(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tgid.x() #0
store i32 %0, i32 addrspace(1)* %out
@@ -11,7 +11,7 @@ entry:
; FUNC-LABEL: {{^}}tgid_y:
; EG: MEM_RAT_CACHELESS STORE_RAW T1.Y
-define void @tgid_y(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tgid_y(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tgid.y() #0
store i32 %0, i32 addrspace(1)* %out
@@ -20,7 +20,7 @@ entry:
; FUNC-LABEL: {{^}}tgid_z:
; EG: MEM_RAT_CACHELESS STORE_RAW T1.Z
-define void @tgid_z(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tgid_z(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tgid.z() #0
store i32 %0, i32 addrspace(1)* %out
@@ -29,7 +29,7 @@ entry:
; FUNC-LABEL: {{^}}tidig_x:
; EG: MEM_RAT_CACHELESS STORE_RAW T0.X
-define void @tidig_x(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tidig_x(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.x() #0
store i32 %0, i32 addrspace(1)* %out
@@ -38,7 +38,7 @@ entry:
; FUNC-LABEL: {{^}}tidig_y:
; EG: MEM_RAT_CACHELESS STORE_RAW T0.Y
-define void @tidig_y(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tidig_y(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.y() #0
store i32 %0, i32 addrspace(1)* %out
@@ -47,7 +47,7 @@ entry:
; FUNC-LABEL: {{^}}tidig_z:
; EG: MEM_RAT_CACHELESS STORE_RAW T0.Z
-define void @tidig_z(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tidig_z(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.z() #0
store i32 %0, i32 addrspace(1)* %out
@@ -57,7 +57,7 @@ entry:
; FUNC-LABEL: {{^}}test_implicit:
; 36 prepended implicit bytes + 4(out pointer) + 4*4 = 56
; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 56
-define void @test_implicit(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 {
%implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr()
%header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)*
%gep = getelementptr i32, i32 addrspace(7)* %header.ptr, i32 4
@@ -69,7 +69,7 @@ define void @test_implicit(i32 addrspace(1)* %out) #1 {
; FUNC-LABEL: {{^}}test_implicit_dyn:
; 36 prepended implicit bytes + 8(out pointer + in) = 44
; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 44
-define void @test_implicit_dyn(i32 addrspace(1)* %out, i32 %in) #1 {
+define amdgpu_kernel void @test_implicit_dyn(i32 addrspace(1)* %out, i32 %in) #1 {
%implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr()
%header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)*
%gep = getelementptr i32, i32 addrspace(7)* %header.ptr, i32 %in
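
The two offset comments above work out as follows: r600 prepends a 36-byte implicit header, and @llvm.r600.implicitarg.ptr points just past the explicit kernel arguments, so test_implicit reads at 36 + 4 (the out pointer) + 4*4 (gep of 4 x i32) = 56, while test_implicit_dyn's base is 36 + 8 (out pointer + in) = 44 plus the dynamic index. A sketch of the fixed-offset case, names hypothetical:

  %args = call i8 addrspace(7)* @llvm.r600.implicitarg.ptr()
  %hdr = bitcast i8 addrspace(7)* %args to i32 addrspace(7)*
  %slot = getelementptr i32, i32 addrspace(7)* %hdr, i32 4   ; +16 bytes past the argument block
  %v = load i32, i32 addrspace(7)* %slot
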
diff --git a/test/CodeGen/AMDGPU/rcp-pattern.ll b/test/CodeGen/AMDGPU/rcp-pattern.ll
index b7cc6d47cd87..fbdaeb829297 100644
--- a/test/CodeGen/AMDGPU/rcp-pattern.ll
+++ b/test/CodeGen/AMDGPU/rcp-pattern.ll
@@ -9,7 +9,7 @@
; GCN: buffer_store_dword [[RCP]]
; EG: RECIP_IEEE
-define void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 {
%rcp = fdiv float 1.0, %src
store float %rcp, float addrspace(1)* %out, align 4
ret void
@@ -21,7 +21,7 @@ define void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 {
; GCN: buffer_store_dword [[RCP]]
; EG: RECIP_IEEE
-define void @rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
%rcp = fdiv float 1.0, %src, !fpmath !0
store float %rcp, float addrspace(1)* %out, align 4
ret void
@@ -33,7 +33,7 @@ define void @rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
; GCN: buffer_store_dword [[RCP]]
; EG: RECIP_IEEE
-define void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
%rcp = fdiv fast float 1.0, %src, !fpmath !0
store float %rcp, float addrspace(1)* %out, align 4
ret void
@@ -45,7 +45,7 @@ define void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
; GCN: buffer_store_dword [[RCP]]
; EG: RECIP_IEEE
-define void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
%rcp = fdiv arcp float 1.0, %src, !fpmath !0
store float %rcp, float addrspace(1)* %out, align 4
ret void
@@ -57,7 +57,7 @@ define void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
; GCN: buffer_store_dword [[RCP]]
; EG: RECIP_IEEE
-define void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #2 {
+define amdgpu_kernel void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #2 {
%rcp = fdiv float 1.0, %src, !fpmath !0
store float %rcp, float addrspace(1)* %out, align 4
ret void
@@ -69,7 +69,7 @@ define void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src)
; GCN: buffer_store_dword [[RCP]]
; EG: RECIP_IEEE
-define void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 {
%src.fabs = call float @llvm.fabs.f32(float %src)
%rcp = fdiv float 1.0, %src.fabs
store float %rcp, float addrspace(1)* %out, align 4
@@ -82,7 +82,7 @@ define void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 {
; GCN: buffer_store_dword [[RCP]]
; EG: RECIP_IEEE
-define void @neg_rcp_pat_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @neg_rcp_pat_f32(float addrspace(1)* %out, float %src) #0 {
%rcp = fdiv float -1.0, %src
store float %rcp, float addrspace(1)* %out, align 4
ret void
@@ -92,7 +92,7 @@ define void @neg_rcp_pat_f32(float addrspace(1)* %out, float %src) #0 {
; GCN: s_load_dword [[SRC:s[0-9]+]]
; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -|[[SRC]]|
; GCN: buffer_store_dword [[RCP]]
-define void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 {
%src.fabs = call float @llvm.fabs.f32(float %src)
%src.fabs.fneg = fsub float -0.0, %src.fabs
%rcp = fdiv float 1.0, %src.fabs.fneg
@@ -106,7 +106,7 @@ define void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 {
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[SRC]], -|[[SRC]]|
; GCN: buffer_store_dword [[RCP]]
; GCN: buffer_store_dword [[MUL]]
-define void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %src) #0 {
%src.fabs = call float @llvm.fabs.f32(float %src)
%src.fabs.fneg = fsub float -0.0, %src.fabs
%rcp = fdiv float 1.0, %src.fabs.fneg
@@ -117,6 +117,35 @@ define void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %sr
ret void
}
+; FUNC-LABEL: {{^}}div_arcp_2_x_pat_f32:
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
+; GCN: buffer_store_dword [[MUL]]
+define amdgpu_kernel void @div_arcp_2_x_pat_f32(float addrspace(1)* %out) #0 {
+ %x = load float, float addrspace(1)* undef
+ %rcp = fdiv arcp float %x, 2.0
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f32:
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0x3dcccccd, v{{[0-9]+}}
+; GCN: buffer_store_dword [[MUL]]
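+; (0x3dcccccd is the f32 bit pattern for 0.1, i.e. 1.0/10.0, and 0xbdcccccd
+; below is -0.1; arcp permits folding x/k into x * (1/k).)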
+define amdgpu_kernel void @div_arcp_k_x_pat_f32(float addrspace(1)* %out) #0 {
+ %x = load float, float addrspace(1)* undef
+ %rcp = fdiv arcp float %x, 10.0
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f32:
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0xbdcccccd, v{{[0-9]+}}
+; GCN: buffer_store_dword [[MUL]]
+define amdgpu_kernel void @div_arcp_neg_k_x_pat_f32(float addrspace(1)* %out) #0 {
+ %x = load float, float addrspace(1)* undef
+ %rcp = fdiv arcp float %x, -10.0
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
declare float @llvm.fabs.f32(float) #1
declare float @llvm.sqrt.f32(float) #1
diff --git a/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll b/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
index a5581d73cb25..34cbe3963361 100644
--- a/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
+++ b/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
@@ -4,7 +4,7 @@
declare i32 @llvm.read_register.i32(metadata) #0
-define void @test_invalid_read_flat_scratch_lo(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @test_invalid_read_flat_scratch_lo(i32 addrspace(1)* %out) nounwind {
store volatile i32 0, i32 addrspace(3)* undef
%m0 = call i32 @llvm.read_register.i32(metadata !0)
store i32 %m0, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll b/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll
index 2617ad7402ff..6417d28e7aad 100644
--- a/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll
+++ b/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll
@@ -4,7 +4,7 @@
declare i32 @llvm.read_register.i32(metadata) #0
-define void @test_invalid_read_exec(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @test_invalid_read_exec(i32 addrspace(1)* %out) nounwind {
store volatile i32 0, i32 addrspace(3)* undef
%m0 = call i32 @llvm.read_register.i32(metadata !0)
store i32 %m0, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll b/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll
index dcde8a1894fc..8e248fdfea4c 100644
--- a/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll
+++ b/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll
@@ -4,7 +4,7 @@
declare i64 @llvm.read_register.i64(metadata) #0
-define void @test_invalid_read_m0(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_invalid_read_m0(i64 addrspace(1)* %out) #0 {
%exec = call i64 @llvm.read_register.i64(metadata !0)
store i64 %exec, i64 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/read_register.ll b/test/CodeGen/AMDGPU/read_register.ll
index 601a0adb8122..8fe9e7f3f111 100644
--- a/test/CodeGen/AMDGPU/read_register.ll
+++ b/test/CodeGen/AMDGPU/read_register.ll
@@ -9,7 +9,7 @@ declare i64 @llvm.read_register.i64(metadata) #0
; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], [[COPY_M0]]
; CHECK: buffer_store_dword [[COPY]]
-define void @test_read_m0(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_m0(i32 addrspace(1)* %out) #0 {
store volatile i32 0, i32 addrspace(3)* undef
%m0 = call i32 @llvm.read_register.i32(metadata !0)
store i32 %m0, i32 addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @test_read_m0(i32 addrspace(1)* %out) #0 {
; CHECK: v_mov_b32_e32 v[[LO:[0-9]+]], exec_lo
; CHECK: v_mov_b32_e32 v[[HI:[0-9]+]], exec_hi
; CHECK: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_read_exec(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_exec(i64 addrspace(1)* %out) #0 {
%exec = call i64 @llvm.read_register.i64(metadata !1)
store i64 %exec, i64 addrspace(1)* %out
ret void
@@ -30,7 +30,7 @@ define void @test_read_exec(i64 addrspace(1)* %out) #0 {
; CHECK: v_mov_b32_e32 v[[LO:[0-9]+]], flat_scratch_lo
; CHECK: v_mov_b32_e32 v[[HI:[0-9]+]], flat_scratch_hi
; CHECK: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_read_flat_scratch(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_flat_scratch(i64 addrspace(1)* %out) #0 {
%flat_scratch = call i64 @llvm.read_register.i64(metadata !2)
store i64 %flat_scratch, i64 addrspace(1)* %out
ret void
@@ -39,7 +39,7 @@ define void @test_read_flat_scratch(i64 addrspace(1)* %out) #0 {
; CHECK-LABEL: {{^}}test_read_flat_scratch_lo:
; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], flat_scratch_lo
; CHECK: buffer_store_dword [[COPY]]
-define void @test_read_flat_scratch_lo(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_flat_scratch_lo(i32 addrspace(1)* %out) #0 {
%flat_scratch_lo = call i32 @llvm.read_register.i32(metadata !3)
store i32 %flat_scratch_lo, i32 addrspace(1)* %out
ret void
@@ -48,7 +48,7 @@ define void @test_read_flat_scratch_lo(i32 addrspace(1)* %out) #0 {
; CHECK-LABEL: {{^}}test_read_flat_scratch_hi:
; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], flat_scratch_hi
; CHECK: buffer_store_dword [[COPY]]
-define void @test_read_flat_scratch_hi(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_flat_scratch_hi(i32 addrspace(1)* %out) #0 {
%flat_scratch_hi = call i32 @llvm.read_register.i32(metadata !4)
store i32 %flat_scratch_hi, i32 addrspace(1)* %out
ret void
@@ -57,7 +57,7 @@ define void @test_read_flat_scratch_hi(i32 addrspace(1)* %out) #0 {
; CHECK-LABEL: {{^}}test_read_exec_lo:
; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], exec_lo
; CHECK: buffer_store_dword [[COPY]]
-define void @test_read_exec_lo(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_exec_lo(i32 addrspace(1)* %out) #0 {
%exec_lo = call i32 @llvm.read_register.i32(metadata !5)
store i32 %exec_lo, i32 addrspace(1)* %out
ret void
@@ -66,7 +66,7 @@ define void @test_read_exec_lo(i32 addrspace(1)* %out) #0 {
; CHECK-LABEL: {{^}}test_read_exec_hi:
; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], exec_hi
; CHECK: buffer_store_dword [[COPY]]
-define void @test_read_exec_hi(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_exec_hi(i32 addrspace(1)* %out) #0 {
%exec_hi = call i32 @llvm.read_register.i32(metadata !6)
store i32 %exec_hi, i32 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/readcyclecounter.ll b/test/CodeGen/AMDGPU/readcyclecounter.ll
index 7965b061fe5b..5c698c839fa6 100644
--- a/test/CodeGen/AMDGPU/readcyclecounter.ll
+++ b/test/CodeGen/AMDGPU/readcyclecounter.ll
@@ -13,7 +13,7 @@ declare i64 @llvm.readcyclecounter() #0
; SI: s_memtime s{{\[[0-9]+:[0-9]+\]}}
; VI: s_memrealtime s{{\[[0-9]+:[0-9]+\]}}
; GCN: store_dwordx2
-define void @test_readcyclecounter(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_readcyclecounter(i64 addrspace(1)* %out) #0 {
%cycle0 = call i64 @llvm.readcyclecounter()
store volatile i64 %cycle0, i64 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
index dd67dc488dbf..ecb513cd80b6 100644
--- a/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
+++ b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
@@ -6,7 +6,7 @@
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, [[VAL]]
; GCN: buffer_store_dwordx2
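; (1234567 = 0x12d687 fits entirely in the low 32 bits, so the masked i64 load
; can be shrunk to a single dword load; the high half of the result is zero.)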
-define void @reduce_i64_load_align_4_width_to_i32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @reduce_i64_load_align_4_width_to_i32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
%a = load i64, i64 addrspace(1)* %in, align 4
%and = and i64 %a, 1234567
store i64 %and, i64 addrspace(1)* %out, align 8
@@ -16,7 +16,7 @@ define void @reduce_i64_load_align_4_width_to_i32(i64 addrspace(1)* %out, i64 ad
; GCN-LABEL: {{^}}reduce_i64_align_4_bitcast_v2i32_elt0:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: buffer_store_dword [[VAL]]
-define void @reduce_i64_align_4_bitcast_v2i32_elt0(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @reduce_i64_align_4_bitcast_v2i32_elt0(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
%a = load i64, i64 addrspace(1)* %in, align 4
%vec = bitcast i64 %a to <2 x i32>
%elt0 = extractelement <2 x i32> %vec, i32 0
@@ -27,7 +27,7 @@ define void @reduce_i64_align_4_bitcast_v2i32_elt0(i32 addrspace(1)* %out, i64 a
; GCN-LABEL: {{^}}reduce_i64_align_4_bitcast_v2i32_elt1:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4
; GCN: buffer_store_dword [[VAL]]
-define void @reduce_i64_align_4_bitcast_v2i32_elt1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @reduce_i64_align_4_bitcast_v2i32_elt1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
%a = load i64, i64 addrspace(1)* %in, align 4
%vec = bitcast i64 %a to <2 x i32>
%elt0 = extractelement <2 x i32> %vec, i32 1
diff --git a/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll b/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
index 281e49f804c6..601aca48e1e2 100644
--- a/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
+++ b/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
@@ -3,7 +3,7 @@
; GCN-LABEL: {{^}}store_v2i32_as_v4i16_align_4:
; GCN: s_load_dwordx2
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
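; (An align-4 store to LDS is split into dword components; ds_write2_b32 with
; offset1:1 writes two consecutive dwords, its offsets being in dword units.)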
-define void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
+define amdgpu_kernel void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
%x.bc = bitcast <2 x i32> %x to <4 x i16>
store <4 x i16> %x.bc, <4 x i16> addrspace(3)* %out, align 4
ret void
@@ -13,7 +13,7 @@ define void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)* align 4 %out,
; GCN: s_load_dwordx4
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
-define void @store_v4i32_as_v8i16_align_4(<8 x i16> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
+define amdgpu_kernel void @store_v4i32_as_v8i16_align_4(<8 x i16> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
%x.bc = bitcast <4 x i32> %x to <8 x i16>
store <8 x i16> %x.bc, <8 x i16> addrspace(3)* %out, align 4
ret void
@@ -22,7 +22,7 @@ define void @store_v4i32_as_v8i16_align_4(<8 x i16> addrspace(3)* align 4 %out,
; GCN-LABEL: {{^}}store_v2i32_as_i64_align_4:
; GCN: s_load_dwordx2
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
-define void @store_v2i32_as_i64_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
+define amdgpu_kernel void @store_v2i32_as_i64_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
%x.bc = bitcast <2 x i32> %x to <4 x i16>
store <4 x i16> %x.bc, <4 x i16> addrspace(3)* %out, align 4
ret void
@@ -32,7 +32,7 @@ define void @store_v2i32_as_i64_align_4(<4 x i16> addrspace(3)* align 4 %out, <2
; GCN: s_load_dwordx4
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
-define void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
+define amdgpu_kernel void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
%x.bc = bitcast <4 x i32> %x to <2 x i64>
store <2 x i64> %x.bc, <2 x i64> addrspace(3)* %out, align 4
ret void
@@ -44,7 +44,7 @@ define void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* align 4 %out,
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
-define void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 {
+define amdgpu_kernel void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 {
%x.bc = bitcast <4 x i16> %x to <2 x i32>
store <2 x i32> %x.bc, <2 x i32> addrspace(3)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll b/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll
index 909644850750..9f8667d35993 100644
--- a/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll
+++ b/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll
@@ -6,7 +6,7 @@
declare i32 @llvm.amdgcn.workitem.id.x() #0
-define void @reg_coalescer_breaks_dead(<2 x i32> addrspace(1)* nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3) #1 {
+define amdgpu_kernel void @reg_coalescer_breaks_dead(<2 x i32> addrspace(1)* nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3) #1 {
bb:
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
%cmp0 = icmp eq i32 %id.x, 0
diff --git a/test/CodeGen/AMDGPU/regcoalesce-dbg.mir b/test/CodeGen/AMDGPU/regcoalesce-dbg.mir
new file mode 100644
index 000000000000..ecf94b5772ff
--- /dev/null
+++ b/test/CodeGen/AMDGPU/regcoalesce-dbg.mir
@@ -0,0 +1,76 @@
+# RUN: llc -march=amdgcn -run-pass simple-register-coalescing -o - %s | FileCheck %s
+
+# Test that register coalescing does not allow a call to
+# LIS->getInstructionIndex with a DBG_VALUE instruction, which does not have
+# a slot index.
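+#
+# A sketch of the kind of guard this exercises in RegisterCoalescer
+# (illustrative, not the exact upstream code): debug instructions must be
+# skipped before querying LiveIntervals, e.g.
+#   if (MI->isDebugValue())
+#     continue; // DBG_VALUE has no SlotIndex entry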
+
+# CHECK: %13.sub2 = S_MOV_B32 0
+# CHECK: DBG_VALUE{{.*}}debug-use %13.sub2
+
+--- |
+ define amdgpu_kernel void @test(i32 addrspace(1)* %out) { ret void }
+
+ !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !4, producer: "llvm", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !4)
+ !1 = !DILocalVariable(name: "a", scope: !2, file: !4, line: 126, type: !6)
+ !2 = distinct !DISubprogram(name: "test", scope: !4, file: !4, line: 1, type: !3, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !5)
+ !3 = !DISubroutineType(types: !4)
+ !4 = !{null}
+ !5 = !{!1}
+ !6 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64, align: 32)
+ !7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+ !8 = !DIExpression()
+ !9 = !DILocation(line: 126, column: 9, scope: !2)
+
+...
+---
+name: test
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_32_xm0_xexec }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: sreg_32_xm0 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sgpr_64 }
+ - { id: 13, class: sgpr_128 }
+ - { id: 14, class: sreg_32_xm0 }
+ - { id: 15, class: sreg_64 }
+ - { id: 16, class: vgpr_32 }
+ - { id: 17, class: vreg_64 }
+ - { id: 18, class: vgpr_32 }
+ - { id: 19, class: vreg_64 }
+ - { id: 20, class: vreg_64 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+body: |
+ bb.0:
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY killed %vgpr0
+ %0 = COPY killed %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORD_IMM killed %0, 13, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+ %18 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ undef %19.sub0 = COPY killed %3
+ %19.sub1 = COPY killed %18
+ %10 = S_MOV_B32 61440
+ %11 = S_MOV_B32 0
+ DBG_VALUE debug-use %11, debug-use _, !1, !8, debug-location !9
+ undef %12.sub0 = COPY killed %11
+ %12.sub1 = COPY killed %10
+ undef %13.sub0_sub1 = COPY killed %4
+ %13.sub2_sub3 = COPY killed %12
+ %20 = V_LSHL_B64 killed %19, 2, implicit %exec
+ %16 = COPY killed %5
+ BUFFER_STORE_DWORD_ADDR64 killed %16, killed %20, killed %13, 0, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.out)
+ S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/register-count-comments.ll b/test/CodeGen/AMDGPU/register-count-comments.ll
index bff3a9f5d2b0..26a76cf2041e 100644
--- a/test/CodeGen/AMDGPU/register-count-comments.ll
+++ b/test/CodeGen/AMDGPU/register-count-comments.ll
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
; SI: ; Kernel info:
; SI: ; NumSgprs: {{[0-9]+}}
; SI: ; NumVgprs: {{[0-9]+}}
-define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind {
+define amdgpu_kernel void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind {
%mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0);
%tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
%aptr = getelementptr i32, i32 addrspace(1)* %abase, i32 %tid
@@ -24,7 +24,7 @@ define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 a
; SI-LABEL: {{^}}one_vgpr_used:
; SI: NumVgprs: 1
-define void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind {
+define amdgpu_kernel void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind {
store i32 %x, i32 addrspace(1)* %out, align 4
ret void
}
diff --git a/test/CodeGen/AMDGPU/rename-disconnected-bug.ll b/test/CodeGen/AMDGPU/rename-disconnected-bug.ll
index 47bdfba96530..5d4955aa1ce2 100644
--- a/test/CodeGen/AMDGPU/rename-disconnected-bug.ll
+++ b/test/CodeGen/AMDGPU/rename-disconnected-bug.ll
@@ -3,7 +3,7 @@
; definition on every path (there should at least be IMPLICIT_DEF instructions).
target triple = "amdgcn--"
-define void @func() {
+define amdgpu_kernel void @func() {
B0:
br i1 undef, label %B1, label %B2
diff --git a/test/CodeGen/AMDGPU/rename-independent-subregs.mir b/test/CodeGen/AMDGPU/rename-independent-subregs.mir
index b928bc7086bb..fc2e4426ba48 100644
--- a/test/CodeGen/AMDGPU/rename-independent-subregs.mir
+++ b/test/CodeGen/AMDGPU/rename-independent-subregs.mir
@@ -1,7 +1,7 @@
# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass simple-register-coalescing,rename-independent-subregs -o - %s | FileCheck %s
--- |
- define void @test0() { ret void }
- define void @test1() { ret void }
+ define amdgpu_kernel void @test0() { ret void }
+ define amdgpu_kernel void @test1() { ret void }
...
---
# In the test below we have two independent def+use pairs of subregister1 which
diff --git a/test/CodeGen/AMDGPU/reorder-stores.ll b/test/CodeGen/AMDGPU/reorder-stores.ll
index 412202fa5d51..ff4069226a62 100644
--- a/test/CodeGen/AMDGPU/reorder-stores.ll
+++ b/test/CodeGen/AMDGPU/reorder-stores.ll
@@ -7,7 +7,7 @@
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: s_endpgm
-define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind {
+define amdgpu_kernel void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind {
%tmp1 = load <2 x double>, <2 x double> addrspace(1)* %x, align 16
%tmp4 = load <2 x double>, <2 x double> addrspace(1)* %y, align 16
store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16
@@ -19,7 +19,7 @@ define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocap
; SI: ds_read2_b64
; SI: ds_write2_b64
; SI: s_endpgm
-define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
+define amdgpu_kernel void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
%tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16
%tmp4 = load <2 x double>, <2 x double> addrspace(3)* %y, align 16
store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16
@@ -39,7 +39,7 @@ define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: s_endpgm
-define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind {
+define amdgpu_kernel void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind {
%tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32
%tmp4 = load <8 x i32>, <8 x i32> addrspace(1)* %y, align 32
store <8 x i32> %tmp4, <8 x i32> addrspace(1)* %x, align 32
@@ -54,7 +54,7 @@ define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* no
; SI-NOT: ds_read
; SI: ds_write_b64
; SI: s_endpgm
-define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind {
+define amdgpu_kernel void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind {
%tmp1 = load <2 x i32>, <2 x i32> addrspace(3)* %x, align 8
%tmp4 = load <2 x i32>, <2 x i32> addrspace(3)* %y, align 8
%tmp1ext = zext <2 x i32> %tmp1 to <2 x i64>
diff --git a/test/CodeGen/AMDGPU/ret.ll b/test/CodeGen/AMDGPU/ret.ll
index 515203fad4cb..831c71dff79d 100644
--- a/test/CodeGen/AMDGPU/ret.ll
+++ b/test/CodeGen/AMDGPU/ret.ll
@@ -1,25 +1,24 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
; GCN-LABEL: {{^}}vgpr:
; GCN: v_mov_b32_e32 v1, v0
; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
-; GCN-DAG: exp mrt0 v1, v1, v1, v1 done compr vm
+; GCN-DAG: exp mrt0 v1, v1, v1, v1 done vm
; GCN: s_waitcnt expcnt(0)
; GCN-NOT: s_endpgm
-define amdgpu_vs {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
- %x = fadd float %3, 1.0
- %a = insertvalue {float, float} undef, float %x, 0
- %b = insertvalue {float, float} %a, float %3, 1
- ret {float, float} %b
+define amdgpu_vs { float, float } @vgpr([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+bb:
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0
+ %x = fadd float %arg3, 1.000000e+00
+ %a = insertvalue { float, float } undef, float %x, 0
+ %b = insertvalue { float, float } %a, float %arg3, 1
+ ret { float, float } %b
}
; GCN-LABEL: {{^}}vgpr_literal:
; GCN: v_mov_b32_e32 v4, v0
-; GCN: exp mrt0 v4, v4, v4, v4 done compr vm
+; GCN: exp mrt0 v4, v4, v4, v4 done vm
; GCN-DAG: v_mov_b32_e32 v0, 1.0
; GCN-DAG: v_mov_b32_e32 v1, 2.0
@@ -27,12 +26,12 @@ define amdgpu_vs {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 i
; GCN-DAG: v_mov_b32_e32 v3, -1.0
; GCN: s_waitcnt expcnt(0)
; GCN-NOT: s_endpgm
-define amdgpu_vs {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
- ret {float, float, float, float} {float 1.0, float 2.0, float 4.0, float -1.0}
+define amdgpu_vs { float, float, float, float } @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+bb:
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0
+ ret { float, float, float, float } { float 1.000000e+00, float 2.000000e+00, float 4.000000e+00, float -1.000000e+00 }
}
-
; GCN: .long 165580
; GCN-NEXT: .long 562
; GCN-NEXT: .long 165584
@@ -44,24 +43,24 @@ define amdgpu_vs {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addr
; GCN: v_mov_b32_e32 v3, v4
; GCN: v_mov_b32_e32 v4, v6
; GCN-NOT: s_endpgm
-define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
- %i0 = extractelement <2 x i32> %4, i32 0
- %i1 = extractelement <2 x i32> %4, i32 1
- %i2 = extractelement <2 x i32> %7, i32 0
- %i3 = extractelement <2 x i32> %8, i32 0
+define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
+bb:
+ %i0 = extractelement <2 x i32> %arg4, i32 0
+ %i1 = extractelement <2 x i32> %arg4, i32 1
+ %i2 = extractelement <2 x i32> %arg7, i32 0
+ %i3 = extractelement <2 x i32> %arg8, i32 0
%f0 = bitcast i32 %i0 to float
%f1 = bitcast i32 %i1 to float
%f2 = bitcast i32 %i2 to float
%f3 = bitcast i32 %i3 to float
- %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
- %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
- %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
- %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
- %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
- ret {float, float, float, float, float} %r4
+ %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0
+ %r1 = insertvalue { float, float, float, float, float } %r0, float %f1, 1
+ %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2
+ %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3
+ %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4
+ ret { float, float, float, float, float } %r4
}
-
; GCN: .long 165580
; GCN-NEXT: .long 1
; GCN-NEXT: .long 165584
@@ -69,11 +68,11 @@ define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i
; GCN-LABEL: {{^}}ps_input_ena_no_inputs:
; GCN: v_mov_b32_e32 v0, 1.0
; GCN-NOT: s_endpgm
-define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
- ret float 1.0
+define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
+bb:
+ ret float 1.000000e+00
}
-
; GCN: .long 165580
; GCN-NEXT: .long 2081
; GCN-NEXT: .long 165584
@@ -83,14 +82,14 @@ define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byv
; GCN-DAG: v_mov_b32_e32 v1, v2
; GCN: v_mov_b32_e32 v2, v3
; GCN-NOT: s_endpgm
-define amdgpu_ps {float, <2 x float>} @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
- %f = bitcast <2 x i32> %8 to <2 x float>
- %s = insertvalue {float, <2 x float>} undef, float %14, 0
- %s1 = insertvalue {float, <2 x float>} %s, <2 x float> %f, 1
- ret {float, <2 x float>} %s1
+define amdgpu_ps { float, <2 x float> } @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
+bb:
+ %f = bitcast <2 x i32> %arg8 to <2 x float>
+ %s = insertvalue { float, <2 x float> } undef, float %arg14, 0
+ %s1 = insertvalue { float, <2 x float> } %s, <2 x float> %f, 1
+ ret { float, <2 x float> } %s1
}
-
; GCN: .long 165580
; GCN-NEXT: .long 562
; GCN-NEXT: .long 165584
@@ -102,25 +101,24 @@ define amdgpu_ps {float, <2 x float>} @ps_input_ena_pos_w([9 x <16 x i8>] addrsp
; GCN-DAG: v_mov_b32_e32 v3, v6
; GCN-DAG: v_mov_b32_e32 v4, v8
; GCN-NOT: s_endpgm
-attributes #1 = { "InitialPSInputAddr"="1" }
-define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
- %i0 = extractelement <2 x i32> %4, i32 0
- %i1 = extractelement <2 x i32> %4, i32 1
- %i2 = extractelement <2 x i32> %7, i32 0
- %i3 = extractelement <2 x i32> %8, i32 0
+define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #2 {
+bb:
+ %i0 = extractelement <2 x i32> %arg4, i32 0
+ %i1 = extractelement <2 x i32> %arg4, i32 1
+ %i2 = extractelement <2 x i32> %arg7, i32 0
+ %i3 = extractelement <2 x i32> %arg8, i32 0
%f0 = bitcast i32 %i0 to float
%f1 = bitcast i32 %i1 to float
%f2 = bitcast i32 %i2 to float
%f3 = bitcast i32 %i3 to float
- %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
- %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
- %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
- %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
- %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
- ret {float, float, float, float, float} %r4
+ %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0
+ %r1 = insertvalue { float, float, float, float, float } %r0, float %f1, 1
+ %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2
+ %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3
+ %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4
+ ret { float, float, float, float, float } %r4
}
-
; GCN: .long 165580
; GCN-NEXT: .long 562
; GCN-NEXT: .long 165584
@@ -132,25 +130,24 @@ define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i
; GCN: v_mov_b32_e32 v3, v8
; GCN: v_mov_b32_e32 v4, v12
; GCN-NOT: s_endpgm
-attributes #2 = { "InitialPSInputAddr"="119" }
-define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #2 {
- %i0 = extractelement <2 x i32> %4, i32 0
- %i1 = extractelement <2 x i32> %4, i32 1
- %i2 = extractelement <2 x i32> %7, i32 0
- %i3 = extractelement <2 x i32> %8, i32 0
+define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #3 {
+bb:
+ %i0 = extractelement <2 x i32> %arg4, i32 0
+ %i1 = extractelement <2 x i32> %arg4, i32 1
+ %i2 = extractelement <2 x i32> %arg7, i32 0
+ %i3 = extractelement <2 x i32> %arg8, i32 0
%f0 = bitcast i32 %i0 to float
%f1 = bitcast i32 %i1 to float
%f2 = bitcast i32 %i2 to float
%f3 = bitcast i32 %i3 to float
- %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
- %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
- %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
- %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
- %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
- ret {float, float, float, float, float} %r4
+ %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0
+ %r1 = insertvalue { float, float, float, float, float } %r0, float %f1, 1
+ %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2
+ %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3
+ %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4
+ ret { float, float, float, float, float } %r4
}
-
; GCN: .long 165580
; GCN-NEXT: .long 562
; GCN-NEXT: .long 165584
@@ -162,38 +159,37 @@ define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x
; GCN: v_mov_b32_e32 v3, v4
; GCN: v_mov_b32_e32 v4, v8
; GCN-NOT: s_endpgm
-attributes #3 = { "InitialPSInputAddr"="418" }
-define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #3 {
- %i0 = extractelement <2 x i32> %4, i32 0
- %i1 = extractelement <2 x i32> %4, i32 1
- %i2 = extractelement <2 x i32> %7, i32 0
- %i3 = extractelement <2 x i32> %8, i32 0
+define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #4 {
+bb:
+ %i0 = extractelement <2 x i32> %arg4, i32 0
+ %i1 = extractelement <2 x i32> %arg4, i32 1
+ %i2 = extractelement <2 x i32> %arg7, i32 0
+ %i3 = extractelement <2 x i32> %arg8, i32 0
%f0 = bitcast i32 %i0 to float
%f1 = bitcast i32 %i1 to float
%f2 = bitcast i32 %i2 to float
%f3 = bitcast i32 %i3 to float
- %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
- %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
- %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
- %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
- %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
- ret {float, float, float, float, float} %r4
+ %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0
+ %r1 = insertvalue { float, float, float, float, float } %r0, float %f1, 1
+ %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2
+ %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3
+ %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4
+ ret { float, float, float, float, float } %r4
}
-
; GCN-LABEL: {{^}}sgpr:
; GCN: s_add_i32 s0, s3, 2
; GCN: s_mov_b32 s2, s3
; GCN-NOT: s_endpgm
-define amdgpu_vs {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
- %x = add i32 %2, 2
- %a = insertvalue {i32, i32, i32} undef, i32 %x, 0
- %b = insertvalue {i32, i32, i32} %a, i32 %1, 1
- %c = insertvalue {i32, i32, i32} %a, i32 %2, 2
- ret {i32, i32, i32} %c
+define amdgpu_vs { i32, i32, i32 } @sgpr([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+bb:
+ %x = add i32 %arg2, 2
+ %a = insertvalue { i32, i32, i32 } undef, i32 %x, 0
+ %b = insertvalue { i32, i32, i32 } %a, i32 %arg1, 1
+ %c = insertvalue { i32, i32, i32 } %a, i32 %arg2, 2
+ ret { i32, i32, i32 } %c
}
-
; GCN-LABEL: {{^}}sgpr_literal:
; GCN: s_mov_b32 s0, 5
; GCN-NOT: s_mov_b32 s0, s0
@@ -201,37 +197,37 @@ define amdgpu_vs {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* byval, i32
; GCN-DAG: s_mov_b32 s2, 7
; GCN-DAG: s_mov_b32 s3, 8
; GCN-NOT: s_endpgm
-define amdgpu_vs {i32, i32, i32, i32} @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
- %x = add i32 %2, 2
- ret {i32, i32, i32, i32} {i32 5, i32 6, i32 7, i32 8}
+define amdgpu_vs { i32, i32, i32, i32 } @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+bb:
+ %x = add i32 %arg2, 2
+ ret { i32, i32, i32, i32 } { i32 5, i32 6, i32 7, i32 8 }
}
-
; GCN-LABEL: {{^}}both:
; GCN: v_mov_b32_e32 v1, v0
-; GCN-DAG: exp mrt0 v1, v1, v1, v1 done compr vm
+; GCN-DAG: exp mrt0 v1, v1, v1, v1 done vm
; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
; GCN-DAG: s_add_i32 s0, s3, 2
; GCN-DAG: s_mov_b32 s1, s2
; GCN: s_mov_b32 s2, s3
; GCN: s_waitcnt expcnt(0)
; GCN-NOT: s_endpgm
-define amdgpu_vs {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
- %v = fadd float %3, 1.0
- %s = add i32 %2, 2
- %a0 = insertvalue {float, i32, float, i32, i32} undef, float %v, 0
- %a1 = insertvalue {float, i32, float, i32, i32} %a0, i32 %s, 1
- %a2 = insertvalue {float, i32, float, i32, i32} %a1, float %3, 2
- %a3 = insertvalue {float, i32, float, i32, i32} %a2, i32 %1, 3
- %a4 = insertvalue {float, i32, float, i32, i32} %a3, i32 %2, 4
- ret {float, i32, float, i32, i32} %a4
+define amdgpu_vs { float, i32, float, i32, i32 } @both([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+bb:
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0
+ %v = fadd float %arg3, 1.000000e+00
+ %s = add i32 %arg2, 2
+ %a0 = insertvalue { float, i32, float, i32, i32 } undef, float %v, 0
+ %a1 = insertvalue { float, i32, float, i32, i32 } %a0, i32 %s, 1
+ %a2 = insertvalue { float, i32, float, i32, i32 } %a1, float %arg3, 2
+ %a3 = insertvalue { float, i32, float, i32, i32 } %a2, i32 %arg1, 3
+ %a4 = insertvalue { float, i32, float, i32, i32 } %a3, i32 %arg2, 4
+ ret { float, i32, float, i32, i32 } %a4
}
-
; GCN-LABEL: {{^}}structure_literal:
; GCN: v_mov_b32_e32 v3, v0
-; GCN: exp mrt0 v3, v3, v3, v3 done compr vm
+; GCN: exp mrt0 v3, v3, v3, v3 done vm
; GCN-DAG: v_mov_b32_e32 v0, 1.0
; GCN-DAG: s_mov_b32 s0, 2
@@ -239,9 +235,16 @@ define amdgpu_vs {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2
; GCN-DAG: v_mov_b32_e32 v1, 2.0
; GCN-DAG: v_mov_b32_e32 v2, 4.0
; GCN: s_waitcnt expcnt(0)
-define amdgpu_vs {{float, i32}, {i32, <2 x float>}} @structure_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
- ret {{float, i32}, {i32, <2 x float>}} {{float, i32} {float 1.0, i32 2}, {i32, <2 x float>} {i32 3, <2 x float> <float 2.0, float 4.0>}}
+define amdgpu_vs { { float, i32 }, { i32, <2 x float> } } @structure_literal([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+bb:
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0
+ ret { { float, i32 }, { i32, <2 x float> } } { { float, i32 } { float 1.000000e+00, i32 2 }, { i32, <2 x float> } { i32 3, <2 x float> <float 2.000000e+00, float 4.000000e+00> } }
}
-attributes #0 = { nounwind "InitialPSInputAddr"="0" }
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
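+; Note: llvm.amdgcn.exp.f32(tgt, en, src0, src1, src2, src3, done, vm) carries
+; the done and vm bits as its trailing i1 operands. The compressed export is a
+; separate intrinsic (llvm.amdgcn.exp.compr.*), which is why the checks above
+; no longer expect the "compr" modifier.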
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "InitialPSInputAddr"="0" }
+attributes #2 = { nounwind "InitialPSInputAddr"="1" }
+attributes #3 = { nounwind "InitialPSInputAddr"="119" }
+attributes #4 = { nounwind "InitialPSInputAddr"="418" }
diff --git a/test/CodeGen/AMDGPU/ret_jump.ll b/test/CodeGen/AMDGPU/ret_jump.ll
index 51ca60492414..f2fbacbab82e 100644
--- a/test/CodeGen/AMDGPU/ret_jump.ll
+++ b/test/CodeGen/AMDGPU/ret_jump.ll
@@ -4,24 +4,86 @@
; This should end with a no-op sequence of exec mask manipulations.
; The mask should be back in its original state after the unreachable block executes.
-; GCN-LABEL: {{^}}main:
+
+; GCN-LABEL: {{^}}uniform_br_trivial_ret_divergent_br_trivial_unreachable:
; GCN: s_cbranch_scc1 [[RET_BB:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: ; %else
+
; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
-; GCN-NEXT: ; mask branch [[UNREACHABLE_BB:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
-; GCN: [[RET_BB]]:
-; GCN-NEXT: s_branch [[FINAL_BB:BB[0-9]+_[0-9]+]]
+; GCN: BB{{[0-9]+_[0-9]+}}: ; %unreachable.bb
+; GCN-NEXT: ; divergent unreachable
-; GCN-NEXT: [[UNREACHABLE_BB]]:
-; GCN-NEXT: s_or_b64 exec, exec, [[XOR_EXEC]]
-; GCN-NEXT: [[FINAL_BB]]:
+; GCN-NEXT: {{^}}[[FLOW]]: ; %Flow
+; GCN-NEXT: s_or_b64 exec, exec
+
+; GCN-NEXT: [[RET_BB]]:
+; GCN-NEXT: ; return
; GCN-NEXT: .Lfunc_end0
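; (A uniform condition lowers to a scalar s_cbranch_*, while a divergent one
; lowers to exec-mask updates via s_and_saveexec_b64/s_xor_b64; the checks
; above distinguish exactly these two lowerings.)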
-define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, i32 addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
+define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_trivial_ret_divergent_br_trivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, i32 inreg %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
+entry:
+ %i.i = extractelement <2 x i32> %arg7, i32 0
+ %j.i = extractelement <2 x i32> %arg7, i32 1
+ %i.f.i = bitcast i32 %i.i to float
+ %j.f.i = bitcast i32 %j.i to float
+ %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 1, i32 0, i32 %arg5) #2
+ %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 1, i32 0, i32 %arg5) #2
+ %p87 = fmul float undef, %p2.i
+ %p88 = fadd float %p87, undef
+ %p93 = fadd float %p88, undef
+ %p97 = fmul float %p93, undef
+ %p102 = fsub float %p97, undef
+ %p104 = fmul float %p102, undef
+ %p106 = fadd float 0.000000e+00, %p104
+ %p108 = fadd float undef, %p106
+ %uniform.cond = icmp slt i32 %arg17, 0
+ br i1 %uniform.cond, label %ret.bb, label %else
+
+else: ; preds = %main_body
+ %p124 = fmul float %p108, %p108
+ %p125 = fsub float %p124, undef
+ %divergent.cond = fcmp olt float %p125, 0.000000e+00
+ br i1 %divergent.cond, label %ret.bb, label %unreachable.bb
+
+unreachable.bb: ; preds = %else
+ unreachable
+
+ret.bb: ; preds = %else, %main_body
+ ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef
+}
+
+; GCN-LABEL: {{^}}uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable:
+; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]]
+
+; GCN: ; BB#{{[0-9]+}}: ; %else
+; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
+; GCN-NEXT: ; mask branch [[FLOW1:BB[0-9]+_[0-9]+]]
+
+; GCN-NEXT: ; %unreachable.bb
+; GCN: ds_write_b32
+; GCN: s_waitcnt
+; GCN: ; divergent unreachable
+
+; GCN: ; %ret.bb
+; GCN: store_dword
+
+; GCN: ; %UnifiedReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: ; return
+; GCN-NEXT: .Lfunc_end
+define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
main_body:
- %p83 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7)
- %p87 = fmul float undef, %p83
+ %i.i = extractelement <2 x i32> %arg7, i32 0
+ %j.i = extractelement <2 x i32> %arg7, i32 1
+ %i.f.i = bitcast i32 %i.i to float
+ %j.f.i = bitcast i32 %j.i to float
+ %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 1, i32 0, i32 %arg5) #2
+ %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 1, i32 0, i32 %arg5) #2
+ %p87 = fmul float undef, %p2.i
%p88 = fadd float %p87, undef
%p93 = fadd float %p88, undef
%p97 = fmul float %p93, undef
@@ -29,26 +91,35 @@ main_body:
%p104 = fmul float %p102, undef
%p106 = fadd float 0.000000e+00, %p104
%p108 = fadd float undef, %p106
- br i1 undef, label %ENDIF69, label %ELSE
+ %uniform.cond = icmp slt i32 %arg18, 0
+ br i1 %uniform.cond, label %ret.bb, label %else
-ELSE: ; preds = %main_body
+else: ; preds = %main_body
%p124 = fmul float %p108, %p108
%p125 = fsub float %p124, undef
- %p126 = fcmp olt float %p125, 0.000000e+00
- br i1 %p126, label %ENDIF69, label %ELSE41
+ %divergent.cond = fcmp olt float %p125, 0.000000e+00
+ br i1 %divergent.cond, label %ret.bb, label %unreachable.bb
-ELSE41: ; preds = %ELSE
+unreachable.bb: ; preds = %else
+ store volatile i32 8, i32 addrspace(3)* undef
unreachable
-ENDIF69: ; preds = %ELSE, %main_body
+ret.bb: ; preds = %else, %main_body
+ store volatile i32 11, i32 addrspace(1)* undef
ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef
}
; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
; Function Attrs: nounwind readnone
declare float @llvm.fabs.f32(float) #1
@@ -61,3 +132,4 @@ declare float @llvm.floor.f32(float) #1
attributes #0 = { "InitialPSInputAddr"="36983" }
attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/rotl.i64.ll b/test/CodeGen/AMDGPU/rotl.i64.ll
index b60c470de97c..266490718dd1 100644
--- a/test/CodeGen/AMDGPU/rotl.i64.ll
+++ b/test/CodeGen/AMDGPU/rotl.i64.ll
@@ -7,7 +7,7 @@
; BOTH-DAG: s_lshr_b64
; BOTH: s_or_b64
; BOTH: s_endpgm
-define void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
entry:
%0 = shl i64 %x, %y
%1 = sub i64 64, %y
@@ -26,7 +26,7 @@ entry:
; BOTH: v_or_b32
; BOTH: v_or_b32
; BOTH: s_endpgm
-define void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
+define amdgpu_kernel void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
entry:
%x = load i64, i64 addrspace(1)* %xptr, align 8
%y = load i64, i64 addrspace(1)* %yptr, align 8
diff --git a/test/CodeGen/AMDGPU/rotl.ll b/test/CodeGen/AMDGPU/rotl.ll
index 7d2b5538ca33..c4bc8cdaabf5 100644
--- a/test/CodeGen/AMDGPU/rotl.ll
+++ b/test/CodeGen/AMDGPU/rotl.ll
@@ -10,7 +10,7 @@
; SI: s_sub_i32 [[SDST:s[0-9]+]], 32, {{[s][0-9]+}}
; SI: v_mov_b32_e32 [[VDST:v[0-9]+]], [[SDST]]
; SI: v_alignbit_b32 {{v[0-9]+, [s][0-9]+, s[0-9]+}}, [[VDST]]
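; (The rotate is lowered as a funnel shift: v_alignbit_b32 with a shift amount
; of 32-y performs a right rotate by 32-y, which is the same as rotl by y.)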
-define void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
+define amdgpu_kernel void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
entry:
%0 = shl i32 %x, %y
%1 = sub i32 32, %y
@@ -26,7 +26,7 @@ entry:
; SI-DAG: v_alignbit_b32
; SI-DAG: v_alignbit_b32
; SI: s_endpgm
-define void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
entry:
%0 = shl <2 x i32> %x, %y
%1 = sub <2 x i32> <i32 32, i32 32>, %y
@@ -46,7 +46,7 @@ entry:
; SI-DAG: s_sub_i32
; SI-DAG: v_alignbit_b32
; SI: s_endpgm
-define void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
entry:
%0 = shl <4 x i32> %x, %y
%1 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y
diff --git a/test/CodeGen/AMDGPU/rotr.i64.ll b/test/CodeGen/AMDGPU/rotr.i64.ll
index 58a1efe08079..9eda479cd25c 100644
--- a/test/CodeGen/AMDGPU/rotr.i64.ll
+++ b/test/CodeGen/AMDGPU/rotr.i64.ll
@@ -6,7 +6,7 @@
; BOTH-DAG: s_lshr_b64
; BOTH-DAG: s_lshl_b64
; BOTH: s_or_b64
-define void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
entry:
%tmp0 = sub i64 64, %y
%tmp1 = shl i64 %x, %tmp0
@@ -24,7 +24,7 @@ entry:
; VI-DAG: v_lshlrev_b64
; BOTH: v_or_b32
; BOTH: v_or_b32
-define void @v_rotr_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
+define amdgpu_kernel void @v_rotr_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
entry:
%x = load i64, i64 addrspace(1)* %xptr, align 8
%y = load i64, i64 addrspace(1)* %yptr, align 8
@@ -37,7 +37,7 @@ entry:
}
; BOTH-LABEL: {{^}}s_rotr_v2i64:
-define void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) {
+define amdgpu_kernel void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) {
entry:
%tmp0 = sub <2 x i64> <i64 64, i64 64>, %y
%tmp1 = shl <2 x i64> %x, %tmp0
@@ -48,7 +48,7 @@ entry:
}
; BOTH-LABEL: {{^}}v_rotr_v2i64:
-define void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) {
+define amdgpu_kernel void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) {
entry:
%x = load <2 x i64>, <2 x i64> addrspace(1)* %xptr, align 8
%y = load <2 x i64>, <2 x i64> addrspace(1)* %yptr, align 8
diff --git a/test/CodeGen/AMDGPU/rotr.ll b/test/CodeGen/AMDGPU/rotr.ll
index 55d180077cc7..b4e2c2b67ce1 100644
--- a/test/CodeGen/AMDGPU/rotr.ll
+++ b/test/CodeGen/AMDGPU/rotr.ll
@@ -6,7 +6,7 @@
; R600: BIT_ALIGN_INT
; SI: v_alignbit_b32
-define void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
+define amdgpu_kernel void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
entry:
%tmp0 = sub i32 32, %y
%tmp1 = shl i32 %x, %tmp0
@@ -22,7 +22,7 @@ entry:
; SI: v_alignbit_b32
; SI: v_alignbit_b32
-define void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
entry:
%tmp0 = sub <2 x i32> <i32 32, i32 32>, %y
%tmp1 = shl <2 x i32> %x, %tmp0
@@ -42,7 +42,7 @@ entry:
; SI: v_alignbit_b32
; SI: v_alignbit_b32
; SI: v_alignbit_b32
-define void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
entry:
%tmp0 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y
%tmp1 = shl <4 x i32> %x, %tmp0
diff --git a/test/CodeGen/AMDGPU/rsq.ll b/test/CodeGen/AMDGPU/rsq.ll
index 699440c3efbf..9462683efe0e 100644
--- a/test/CodeGen/AMDGPU/rsq.ll
+++ b/test/CodeGen/AMDGPU/rsq.ll
@@ -8,7 +8,7 @@ declare double @llvm.sqrt.f64(double) nounwind readnone
; SI-LABEL: {{^}}rsq_f32:
; SI: v_rsq_f32_e32
; SI: s_endpgm
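; (1.0 / sqrt(x) folds to a single v_rsq; for f64 below this is only done in
; the unsafe-math mode, while the safe mode keeps the full v_sqrt_f64.)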
-define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%val = load float, float addrspace(1)* %in, align 4
%sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone
%div = fdiv float 1.0, %sqrt
@@ -20,7 +20,7 @@ define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noali
; SI-UNSAFE: v_rsq_f64_e32
; SI-SAFE: v_sqrt_f64_e32
; SI: s_endpgm
-define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
%val = load double, double addrspace(1)* %in, align 4
%sqrt = call double @llvm.sqrt.f64(double %val) nounwind readnone
%div = fdiv double 1.0, %sqrt
@@ -31,7 +31,7 @@ define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noa
; SI-LABEL: {{^}}rsq_f32_sgpr:
; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
; SI: s_endpgm
-define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind {
+define amdgpu_kernel void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind {
%sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone
%div = fdiv float 1.0, %sqrt
store float %div, float addrspace(1)* %out, align 4
@@ -55,7 +55,7 @@ define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind
; SI-SAFE-NOT: v_rsq_f32
; SI: s_endpgm
-define void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -81,7 +81,7 @@ define void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) {
; SI-UNSAFE: v_rsq_f32_e32 [[RSQ:v[0-9]+]], v{{[0-9]+}}
; SI-UNSAFE: v_xor_b32_e32 [[NEG_RSQ:v[0-9]+]], 0x80000000, [[RSQ]]
; SI-UNSAFE: buffer_store_dword [[NEG_RSQ]]
-define void @neg_rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @neg_rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%val = load float, float addrspace(1)* %in, align 4
%sqrt = call float @llvm.sqrt.f32(float %val)
%div = fdiv float -1.0, %sqrt
@@ -96,7 +96,7 @@ define void @neg_rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* n
; SI-UNSAFE: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}
; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]]
; SI-UNSAFE: buffer_store_dwordx2 [[RCP]]
-define void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
%val = load double, double addrspace(1)* %in, align 4
%sqrt = call double @llvm.sqrt.f64(double %val)
%div = fdiv double -1.0, %sqrt
@@ -112,7 +112,7 @@ define void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)*
; SI-UNSAFE: v_rsq_f32_e64 [[RSQ:v[0-9]+]], -v{{[0-9]+}}
; SI-UNSAFE: v_xor_b32_e32 [[NEG_RSQ:v[0-9]+]], 0x80000000, [[RSQ]]
; SI-UNSAFE: buffer_store_dword [[NEG_RSQ]]
-define void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%val = load float, float addrspace(1)* %in, align 4
%val.fneg = fsub float -0.0, %val
%sqrt = call float @llvm.sqrt.f32(float %val.fneg)
@@ -128,7 +128,7 @@ define void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, float addrspace(1
; SI-UNSAFE: v_sqrt_f64_e64 [[SQRT:v\[[0-9]+:[0-9]+\]]], -v{{\[[0-9]+:[0-9]+\]}}
; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]]
; SI-UNSAFE: buffer_store_dwordx2 [[RCP]]
-define void @neg_rsq_neg_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @neg_rsq_neg_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
%val = load double, double addrspace(1)* %in, align 4
%val.fneg = fsub double -0.0, %val
%sqrt = call double @llvm.sqrt.f64(double %val.fneg)
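
The SI-SAFE/SI-UNSAFE split above encodes one folding rule: fdiv 1.0, sqrt(x) may become a single v_rsq only when unsafe FP math is enabled, since rsq is less precise than a real sqrt followed by a divide. A minimal standalone kernel exercising the pattern is sketched below; the kernel name is made up, and the RUN flag is an assumption about how the SI-UNSAFE prefix is driven, not quoted from the test.

; Illustrative sketch, not part of the patch.
; Assumed driver: llc -march=amdgcn -enable-unsafe-fp-math < %s
declare float @llvm.sqrt.f32(float)

define amdgpu_kernel void @rsq_sketch(float addrspace(1)* %out, float %x) {
  %sqrt = call float @llvm.sqrt.f32(float %x)   ; stays v_sqrt_f32 in safe mode
  %rsq = fdiv float 1.0, %sqrt                  ; folds to v_rsq_f32 when unsafe
  store float %rsq, float addrspace(1)* %out
  ret void
}
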
diff --git a/test/CodeGen/AMDGPU/runtime-metadata.ll b/test/CodeGen/AMDGPU/runtime-metadata.ll
deleted file mode 100644
index abdbc325fd4d..000000000000
--- a/test/CodeGen/AMDGPU/runtime-metadata.ll
+++ /dev/null
@@ -1,396 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-runtime-metadata | FileCheck %s
-; RUN: llc -mtriple=amdgcn--amdhsa -filetype=obj -amdgpu-dump-rtmd -amdgpu-check-rtmd-parser %s -o - 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=PARSER %s
-
-%struct.A = type { i8, float }
-%opencl.image1d_t = type opaque
-%opencl.image2d_t = type opaque
-%opencl.image3d_t = type opaque
-%opencl.queue_t = type opaque
-%opencl.pipe_t = type opaque
-%struct.B = type { i32 addrspace(1)*}
-%opencl.clk_event_t = type opaque
-
-; CHECK: ---
-; CHECK-NEXT: { amd.MDVersion: [ 2, 0 ], amd.PrintfInfo: [ '1:1:4:%d\n', '2:1:8:%g\n' ], amd.Kernels:
-
-; CHECK-NEXT: - { amd.KernelName: test_char, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 1, amd.ArgAlign: 1, amd.ArgKind: 0, amd.ArgValueType: 1, amd.ArgTypeName: char, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_char(i8 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !4 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_ushort2, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 4, amd.ArgTypeName: ushort2, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_ushort2(<2 x i16> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !4 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_int3, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 16, amd.ArgAlign: 16, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int3, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_int3(<3 x i32> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !11 !kernel_arg_base_type !11 !kernel_arg_type_qual !4 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_ulong4, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 32, amd.ArgAlign: 32, amd.ArgKind: 0, amd.ArgValueType: 10, amd.ArgTypeName: ulong4, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_ulong4(<4 x i64> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !12 !kernel_arg_base_type !12 !kernel_arg_type_qual !4 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_half8, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 16, amd.ArgAlign: 16, amd.ArgKind: 0, amd.ArgValueType: 5, amd.ArgTypeName: half8, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_half8(<8 x half> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !13 !kernel_arg_base_type !13 !kernel_arg_type_qual !4 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_float16, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 64, amd.ArgAlign: 64, amd.ArgKind: 0, amd.ArgValueType: 8, amd.ArgTypeName: float16, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_float16(<16 x float> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !4 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_double16, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 128, amd.ArgAlign: 128, amd.ArgKind: 0, amd.ArgValueType: 11, amd.ArgTypeName: double16, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_double16(<16 x double> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !15 !kernel_arg_base_type !15 !kernel_arg_type_qual !4 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_pointer, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 6, amd.ArgTypeName: 'int *', amd.ArgAddrQual: 1, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_pointer(i32 addrspace(1)* %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !4 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_image, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 4, amd.ArgValueType: 0, amd.ArgTypeName: image2d_t, amd.ArgAddrQual: 1, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_image(%opencl.image2d_t addrspace(1)* %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !17 !kernel_arg_base_type !17 !kernel_arg_type_qual !4 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_sampler, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 3, amd.ArgValueType: 6, amd.ArgTypeName: sampler_t, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_sampler(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !18 !kernel_arg_base_type !18 !kernel_arg_type_qual !4 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_queue, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 6, amd.ArgValueType: 0, amd.ArgTypeName: queue_t, amd.ArgAddrQual: 1, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_queue(%opencl.queue_t addrspace(1)* %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !19 !kernel_arg_base_type !19 !kernel_arg_type_qual !4 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_struct, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 1, amd.ArgValueType: 0, amd.ArgTypeName: struct A, amd.ArgAddrQual: 0, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_struct(%struct.A* byval %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20 !kernel_arg_base_type !20 !kernel_arg_type_qual !4 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_i128, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 16, amd.ArgAlign: 8, amd.ArgKind: 0, amd.ArgValueType: 0, amd.ArgTypeName: i128, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_i128(i128 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !21 !kernel_arg_base_type !21 !kernel_arg_type_qual !4 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_multi_arg, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 3, amd.ArgTypeName: short2, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 1, amd.ArgTypeName: char3, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_multi_arg(i32 %a, <2 x i16> %b, <3 x i8> %c) !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !24 !kernel_arg_base_type !24 !kernel_arg_type_qual !25 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_addr_space, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 6, amd.ArgTypeName: 'int *', amd.ArgAddrQual: 1, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 6, amd.ArgTypeName: 'int *', amd.ArgAddrQual: 2, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgPointeeAlign: 4, amd.ArgKind: 2, amd.ArgValueType: 6, amd.ArgTypeName: 'int *', amd.ArgAddrQual: 3, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_addr_space(i32 addrspace(1)* %g, i32 addrspace(2)* %c, i32 addrspace(3)* %l) !kernel_arg_addr_space !50 !kernel_arg_access_qual !23 !kernel_arg_type !51 !kernel_arg_base_type !51 !kernel_arg_type_qual !25 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_type_qual, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 6, amd.ArgTypeName: 'int *', amd.ArgAddrQual: 1, amd.ArgAccQual: 0, amd.ArgIsVolatile: 1 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 6, amd.ArgTypeName: 'int *', amd.ArgAddrQual: 1, amd.ArgAccQual: 0, amd.ArgIsConst: 1, amd.ArgIsRestrict: 1 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 5, amd.ArgValueType: 0, amd.ArgTypeName: 'int *', amd.ArgAddrQual: 1, amd.ArgAccQual: 0, amd.ArgIsPipe: 1 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_type_qual(i32 addrspace(1)* %a, i32 addrspace(1)* %b, %opencl.pipe_t addrspace(1)* %c) !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !51 !kernel_arg_base_type !51 !kernel_arg_type_qual !70 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_access_qual, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 4, amd.ArgValueType: 0, amd.ArgTypeName: image1d_t, amd.ArgAddrQual: 1, amd.ArgAccQual: 1 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 4, amd.ArgValueType: 0, amd.ArgTypeName: image2d_t, amd.ArgAddrQual: 1, amd.ArgAccQual: 2 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 4, amd.ArgValueType: 0, amd.ArgTypeName: image3d_t, amd.ArgAddrQual: 1, amd.ArgAccQual: 3 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_access_qual(%opencl.image1d_t addrspace(1)* %ro, %opencl.image2d_t addrspace(1)* %wo, %opencl.image3d_t addrspace(1)* %rw) !kernel_arg_addr_space !60 !kernel_arg_access_qual !61 !kernel_arg_type !62 !kernel_arg_base_type !62 !kernel_arg_type_qual !25 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_vec_type_hint_half, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.VecTypeHint: half, amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_vec_type_hint_half(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !26 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_vec_type_hint_float, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.VecTypeHint: float, amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_vec_type_hint_float(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !27 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_vec_type_hint_double, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.VecTypeHint: double, amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_vec_type_hint_double(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !28 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_vec_type_hint_char, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.VecTypeHint: char, amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_vec_type_hint_char(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !29 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_vec_type_hint_short, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.VecTypeHint: short, amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_vec_type_hint_short(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !30 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_vec_type_hint_long, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.VecTypeHint: long, amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_vec_type_hint_long(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !31 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_vec_type_hint_unknown, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.VecTypeHint: unknown, amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_vec_type_hint_unknown(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !32 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_reqd_wgs_vec_type_hint, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.ReqdWorkGroupSize: [ 1, 2, 4 ], amd.VecTypeHint: int, amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_reqd_wgs_vec_type_hint(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !5 !reqd_work_group_size !6 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_wgs_hint_vec_type_hint, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.WorkGroupSizeHint: [ 8, 16, 32 ], amd.VecTypeHint: uint4, amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_wgs_hint_vec_type_hint(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !7 !work_group_size_hint !8 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_arg_ptr_to_ptr, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 6, amd.ArgTypeName: 'int **', amd.ArgAddrQual: 1, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_arg_ptr_to_ptr(i32 * addrspace(1)* %a) !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !80 !kernel_arg_base_type !80 !kernel_arg_type_qual !4 {
- ret void
-}
-; CHECK-NEXT: - { amd.KernelName: test_arg_struct_contains_ptr, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 1, amd.ArgValueType: 0, amd.ArgTypeName: struct B, amd.ArgAddrQual: 0, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_arg_struct_contains_ptr(%struct.B * byval %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !82 !kernel_arg_base_type !82 !kernel_arg_type_qual !4 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_arg_vector_of_ptr, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 16, amd.ArgAlign: 16, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: 'global int* __attribute__((ext_vector_type(2)))', amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_arg_vector_of_ptr(<2 x i32 addrspace(1)*> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !83 !kernel_arg_base_type !83 !kernel_arg_type_qual !4 {
- ret void
-}
-
-
-; CHECK-NEXT: - { amd.KernelName: test_arg_unknown_builtin_type, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 0, amd.ArgTypeName: clk_event_t, amd.ArgAddrQual: 1, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_arg_unknown_builtin_type(%opencl.clk_event_t addrspace(1)* %a) !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !84 !kernel_arg_base_type !84 !kernel_arg_type_qual !4 {
- ret void
-}
-
-; CHECK-NEXT: - { amd.KernelName: test_pointee_align, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 9, amd.ArgTypeName: 'long *', amd.ArgAddrQual: 1, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgPointeeAlign: 1, amd.ArgKind: 2, amd.ArgValueType: 1, amd.ArgTypeName: 'char *', amd.ArgAddrQual: 3, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgPointeeAlign: 2, amd.ArgKind: 2, amd.ArgValueType: 1, amd.ArgTypeName: 'char2 *', amd.ArgAddrQual: 3, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgPointeeAlign: 4, amd.ArgKind: 2, amd.ArgValueType: 1, amd.ArgTypeName: 'char3 *', amd.ArgAddrQual: 3, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgPointeeAlign: 4, amd.ArgKind: 2, amd.ArgValueType: 1, amd.ArgTypeName: 'char4 *', amd.ArgAddrQual: 3, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgPointeeAlign: 8, amd.ArgKind: 2, amd.ArgValueType: 1, amd.ArgTypeName: 'char8 *', amd.ArgAddrQual: 3, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgPointeeAlign: 16, amd.ArgKind: 2, amd.ArgValueType: 1, amd.ArgTypeName: 'char16 *', amd.ArgAddrQual: 3, amd.ArgAccQual: 0 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } }
-define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a, i8 addrspace(3)* %b, <2 x i8> addrspace(3)* %c, <3 x i8> addrspace(3)* %d, <4 x i8> addrspace(3)* %e, <8 x i8> addrspace(3)* %f, <16 x i8> addrspace(3)* %g) !kernel_arg_addr_space !91 !kernel_arg_access_qual !92 !kernel_arg_type !93 !kernel_arg_base_type !93 !kernel_arg_type_qual !94 {
- ret void
-}
-
-; CHECK-NEXT:...
-
-; PARSER: AMDGPU runtime metadata parser test passes.
-
-!llvm.printf.fmts = !{!100, !101}
-
-!1 = !{i32 0}
-!2 = !{!"none"}
-!3 = !{!"int"}
-!4 = !{!""}
-!5 = !{i32 undef, i32 1}
-!6 = !{i32 1, i32 2, i32 4}
-!7 = !{<4 x i32> undef, i32 0}
-!8 = !{i32 8, i32 16, i32 32}
-!9 = !{!"char"}
-!10 = !{!"ushort2"}
-!11 = !{!"int3"}
-!12 = !{!"ulong4"}
-!13 = !{!"half8"}
-!14 = !{!"float16"}
-!15 = !{!"double16"}
-!16 = !{!"int *"}
-!17 = !{!"image2d_t"}
-!18 = !{!"sampler_t"}
-!19 = !{!"queue_t"}
-!20 = !{!"struct A"}
-!21 = !{!"i128"}
-!22 = !{i32 0, i32 0, i32 0}
-!23 = !{!"none", !"none", !"none"}
-!24 = !{!"int", !"short2", !"char3"}
-!25 = !{!"", !"", !""}
-!26 = !{half undef, i32 1}
-!27 = !{float undef, i32 1}
-!28 = !{double undef, i32 1}
-!29 = !{i8 undef, i32 1}
-!30 = !{i16 undef, i32 1}
-!31 = !{i64 undef, i32 1}
-!32 = !{i32 *undef, i32 1}
-!50 = !{i32 1, i32 2, i32 3}
-!51 = !{!"int *", !"int *", !"int *"}
-!60 = !{i32 1, i32 1, i32 1}
-!61 = !{!"read_only", !"write_only", !"read_write"}
-!62 = !{!"image1d_t", !"image2d_t", !"image3d_t"}
-!70 = !{!"volatile", !"const restrict", !"pipe"}
-!80 = !{!"int **"}
-!81 = !{i32 1}
-!82 = !{!"struct B"}
-!83 = !{!"global int* __attribute__((ext_vector_type(2)))"}
-!84 = !{!"clk_event_t"}
-!opencl.ocl.version = !{!90}
-!90 = !{i32 2, i32 0}
-!91 = !{i32 0, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3}
-!92 = !{!"none", !"none", !"none", !"none", !"none", !"none", !"none"}
-!93 = !{!"long *", !"char *", !"char2 *", !"char3 *", !"char4 *", !"char8 *", !"char16 *"}
-!94 = !{!"", !"", !"", !"", !"", !"", !""}
-!100 = !{!"1:1:4:%d\5Cn"}
-!101 = !{!"2:1:8:%g\5Cn"}
diff --git a/test/CodeGen/AMDGPU/s_addk_i32.ll b/test/CodeGen/AMDGPU/s_addk_i32.ll
index f776faca8397..deef24cea377 100644
--- a/test/CodeGen/AMDGPU/s_addk_i32.ll
+++ b/test/CodeGen/AMDGPU/s_addk_i32.ll
@@ -7,7 +7,7 @@
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]]
; SI: buffer_store_dword [[VRESULT]]
; SI: s_endpgm
-define void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
%add = add i32 %b, 65
store i32 %add, i32 addrspace(1)* %out
ret void
@@ -19,7 +19,7 @@ define void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]]
; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]]
; SI: s_endpgm
-define void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %a, i32 %b) {
+define amdgpu_kernel void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %a, i32 %b) {
%add0 = add i32 %a, 65
%add1 = add i32 %b, 65
store i32 %add0, i32 addrspace(1)* %out0
@@ -30,26 +30,35 @@ define void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1,
; SI-LABEL: {{^}}s_addk_i32_k1:
; SI: s_addk_i32 {{s[0-9]+}}, 0x7fff{{$}}
; SI: s_endpgm
-define void @s_addk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_addk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
%add = add i32 %b, 32767 ; (1 << 15) - 1
store i32 %add, i32 addrspace(1)* %out
ret void
}
; SI-LABEL: {{^}}s_addk_i32_k2:
-; SI: s_addk_i32 {{s[0-9]+}}, 0xffef{{$}}
+; SI: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, 17
; SI: s_endpgm
-define void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
%add = add i32 %b, -17
store i32 %add, i32 addrspace(1)* %out
ret void
}
+; SI-LABEL: {{^}}s_addk_i32_k3:
+; SI: s_addk_i32 {{s[0-9]+}}, 0xffbf{{$}}
+; SI: s_endpgm
+define amdgpu_kernel void @s_addk_i32_k3(i32 addrspace(1)* %out, i32 %b) {
+ %add = add i32 %b, -65
+ store i32 %add, i32 addrspace(1)* %out
+ ret void
+}
+
; SI-LABEL: {{^}}s_addk_v2i32_k0:
; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41
; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42
; SI: s_endpgm
-define void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, <2 x i32> %b) {
+define amdgpu_kernel void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, <2 x i32> %b) {
%add = add <2 x i32> %b, <i32 65, i32 66>
store <2 x i32> %add, <2 x i32> addrspace(1)* %out
ret void
@@ -61,7 +70,7 @@ define void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, <2 x i32> %b) {
; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x43
; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x44
; SI: s_endpgm
-define void @s_addk_v4i32_k0(<4 x i32> addrspace(1)* %out, <4 x i32> %b) {
+define amdgpu_kernel void @s_addk_v4i32_k0(<4 x i32> addrspace(1)* %out, <4 x i32> %b) {
%add = add <4 x i32> %b, <i32 65, i32 66, i32 67, i32 68>
store <4 x i32> %add, <4 x i32> addrspace(1)* %out
ret void
@@ -77,7 +86,7 @@ define void @s_addk_v4i32_k0(<4 x i32> addrspace(1)* %out, <4 x i32> %b) {
; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x47
; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x48
; SI: s_endpgm
-define void @s_addk_v8i32_k0(<8 x i32> addrspace(1)* %out, <8 x i32> %b) {
+define amdgpu_kernel void @s_addk_v8i32_k0(<8 x i32> addrspace(1)* %out, <8 x i32> %b) {
%add = add <8 x i32> %b, <i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72>
store <8 x i32> %add, <8 x i32> addrspace(1)* %out
ret void
@@ -86,7 +95,7 @@ define void @s_addk_v8i32_k0(<8 x i32> addrspace(1)* %out, <8 x i32> %b) {
; SI-LABEL: {{^}}no_s_addk_i32_k0:
; SI: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8000{{$}}
; SI: s_endpgm
-define void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
%add = add i32 %b, 32768 ; 1 << 15
store i32 %add, i32 addrspace(1)* %out
ret void
@@ -96,7 +105,7 @@ define void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
; SI-LABEL: {{^}}commute_s_addk_i32:
; SI: s_addk_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_s_addk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
+define amdgpu_kernel void @commute_s_addk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
%size = call i32 @llvm.amdgcn.groupstaticsize()
%add = add i32 %size, %b
call void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %add)
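
Two things change in this file: every kernel gains the amdgpu_kernel calling convention, and the -17 case now expects s_sub_i32 ..., 17 instead of s_addk_i32 0xffef. The common thread in the constants is the s_addk_i32 immediate field: it is 16 bits and sign-extended, so it covers [-32768, 32767]; 0xffef is -17 (65519 - 65536) and the new k3 constant 0xffbf is -65, while 0x8000 = 32768 falls out of range and forces the full s_add_i32 form with a 32-bit literal. A hedged sketch of the two boundary cases, with made-up kernel names:

; Illustrative sketch, not part of the patch.
define amdgpu_kernel void @addk_fits(i32 addrspace(1)* %out, i32 %b) {
  %r = add i32 %b, 32767           ; 0x7fff, in range -> s_addk_i32 candidate
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @addk_too_big(i32 addrspace(1)* %out, i32 %b) {
  %r = add i32 %b, 32768           ; 0x8000, out of range -> s_add_i32 + literal
  store i32 %r, i32 addrspace(1)* %out
  ret void
}
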
diff --git a/test/CodeGen/AMDGPU/s_movk_i32.ll b/test/CodeGen/AMDGPU/s_movk_i32.ll
index 0164c45083a2..a131aaa3dfb4 100644
--- a/test/CodeGen/AMDGPU/s_movk_i32.ll
+++ b/test/CodeGen/AMDGPU/s_movk_i32.ll
@@ -7,7 +7,7 @@
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]]
; SI: s_endpgm
-define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 4
%or = or i64 %loada, 4295032831 ; ((1 << 16) - 1) | (1 << 32)
store i64 %or, i64 addrspace(1)* %out
@@ -21,7 +21,7 @@ define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]]
; SI: s_endpgm
-define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 4
%or = or i64 %loada, 4295000063 ; ((1 << 15) - 1) | (1 << 32)
store i64 %or, i64 addrspace(1)* %out
@@ -35,7 +35,7 @@ define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 64, v[[HI_VREG]]
; SI: s_endpgm
-define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 4
%or = or i64 %loada, 274877939711 ; ((1 << 15) - 1) | (64 << 32)
store i64 %or, i64 addrspace(1)* %out
@@ -49,7 +49,7 @@ define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]]
; SI: s_endpgm
-define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 4
%or = or i64 %loada, 4295000064 ; (1 << 15) | (1 << 32)
store i64 %or, i64 addrspace(1)* %out
@@ -63,7 +63,7 @@ define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]]
; SI: s_endpgm
-define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 4
%or = or i64 %loada, 4295098368 ; (1 << 17) | (1 << 32)
store i64 %or, i64 addrspace(1)* %out
@@ -78,7 +78,7 @@ define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
; SI: s_endpgm
-define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 4
%or = or i64 %loada, 18374967954648334319 ; -17 & 0xff00ffffffffffff
store i64 %or, i64 addrspace(1)* %out
@@ -92,7 +92,7 @@ define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 63, v[[HI_VREG]]
; SI: s_endpgm
-define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 4
%or = or i64 %loada, 270582939713 ; 65 | (63 << 32)
store i64 %or, i64 addrspace(1)* %out
@@ -107,7 +107,7 @@ define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
; SI: s_endpgm
-define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 4
%or = or i64 %loada, 70368744185856 ; (1 << 13) | ((1 << 14) << 32)
store i64 %or, i64 addrspace(1)* %out
@@ -122,7 +122,7 @@ define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
; SI: s_endpgm
-define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 4
%or = or i64 %loada, 1229782942255906816 ; 0x11111111ffff8000
store i64 %or, i64 addrspace(1)* %out
@@ -137,7 +137,7 @@ define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
; SI: s_endpgm
-define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 4
%or = or i64 %loada, 1229782942255906817 ; 0x11111111ffff8001
store i64 %or, i64 addrspace(1)* %out
@@ -152,7 +152,7 @@ define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
; SI: s_endpgm
-define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 4
%or = or i64 %loada, 1229782942255909000 ; 0x11111111ffff8888
store i64 %or, i64 addrspace(1)* %out
@@ -167,7 +167,7 @@ define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 ad
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
; SI: s_endpgm
-define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 4
%or = or i64 %loada, 1229782942255910911 ; 0x11111111ffff8fff
store i64 %or, i64 addrspace(1)* %out
@@ -182,7 +182,7 @@ define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 ad
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
; SI: s_endpgm
-define void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 4
%or = or i64 %loada, 1229782942255902721 ; 0x11111111ffff7001
store i64 %or, i64 addrspace(1)* %out
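
The k0-k12 cases above all probe the same decision: a 64-bit OR constant is split into independent 32-bit halves, and a half can be materialized with s_movk_i32 exactly when it is the sign extension of some 16-bit immediate; otherwise it costs a full 32-bit literal. Taking k8 as a worked example, 0x11111111ffff8000 splits into hi = 0x11111111 (needs a literal) and lo = 0xffff8000, which is s_movk_i32 0x8000 sign-extended. A sketch with an assumed kernel name:

; Illustrative sketch, not part of the patch: only the high half of the
; constant should need a 32-bit literal; the low half fits s_movk_i32.
define amdgpu_kernel void @movk_split_sketch(i64 addrspace(1)* %out, i64 addrspace(1)* %a) {
  %v = load i64, i64 addrspace(1)* %a, align 4
  %or = or i64 %v, 1229782942255906816  ; 0x11111111ffff8000
  store i64 %or, i64 addrspace(1)* %out
  ret void
}
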
diff --git a/test/CodeGen/AMDGPU/s_mulk_i32.ll b/test/CodeGen/AMDGPU/s_mulk_i32.ll
index e83b368cc1cb..f6ed5408ba45 100644
--- a/test/CodeGen/AMDGPU/s_mulk_i32.ll
+++ b/test/CodeGen/AMDGPU/s_mulk_i32.ll
@@ -7,7 +7,7 @@
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]]
; SI: buffer_store_dword [[VRESULT]]
; SI: s_endpgm
-define void @s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
%mul = mul i32 %b, 65
store i32 %mul, i32 addrspace(1)* %out
ret void
@@ -16,7 +16,7 @@ define void @s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
; SI-LABEL: {{^}}s_mulk_i32_k1:
; SI: s_mulk_i32 {{s[0-9]+}}, 0x7fff{{$}}
; SI: s_endpgm
-define void @s_mulk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_mulk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
%mul = mul i32 %b, 32767 ; (1 << 15) - 1
store i32 %mul, i32 addrspace(1)* %out
ret void
@@ -25,7 +25,7 @@ define void @s_mulk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
; SI-LABEL: {{^}}s_mulk_i32_k2:
; SI: s_mulk_i32 {{s[0-9]+}}, 0xffef{{$}}
; SI: s_endpgm
-define void @s_mulk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_mulk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
%mul = mul i32 %b, -17
store i32 %mul, i32 addrspace(1)* %out
ret void
@@ -34,7 +34,7 @@ define void @s_mulk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
; SI-LABEL: {{^}}no_s_mulk_i32_k0:
; SI: s_mul_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8001{{$}}
; SI: s_endpgm
-define void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
%mul = mul i32 %b, 32769 ; (1 << 15) + 1
store i32 %mul, i32 addrspace(1)* %out
ret void
@@ -44,7 +44,7 @@ define void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
; SI-LABEL: {{^}}commute_s_mulk_i32:
; SI: s_mulk_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_s_mulk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
+define amdgpu_kernel void @commute_s_mulk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
%size = call i32 @llvm.amdgcn.groupstaticsize()
%add = mul i32 %size, %b
call void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %add)
diff --git a/test/CodeGen/AMDGPU/sad.ll b/test/CodeGen/AMDGPU/sad.ll
index 534483401638..f7a1c65881d0 100644
--- a/test/CodeGen/AMDGPU/sad.ll
+++ b/test/CodeGen/AMDGPU/sad.ll
@@ -2,7 +2,7 @@
; GCN-LABEL: {{^}}v_sad_u32_pat1:
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
%icmp0 = icmp ugt i32 %a, %b
%t0 = select i1 %icmp0, i32 %a, i32 %b
@@ -18,7 +18,7 @@ define void @v_sad_u32_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: {{^}}v_sad_u32_constant_pat1:
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 20
-define void @v_sad_u32_constant_pat1(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @v_sad_u32_constant_pat1(i32 addrspace(1)* %out, i32 %a) {
%icmp0 = icmp ugt i32 %a, 90
%t0 = select i1 %icmp0, i32 %a, i32 90
@@ -34,7 +34,7 @@ define void @v_sad_u32_constant_pat1(i32 addrspace(1)* %out, i32 %a) {
; GCN-LABEL: {{^}}v_sad_u32_pat2:
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
%icmp0 = icmp ugt i32 %a, %b
%sub0 = sub i32 %a, %b
%sub1 = sub i32 %b, %a
@@ -51,7 +51,7 @@ define void @v_sad_u32_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
; GCN: s_min_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define void @v_sad_u32_multi_use_sub_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
%icmp0 = icmp ugt i32 %a, %b
%t0 = select i1 %icmp0, i32 %a, i32 %b
@@ -68,7 +68,7 @@ define void @v_sad_u32_multi_use_sub_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b
; GCN-LABEL: {{^}}v_sad_u32_multi_use_add_pat1:
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_multi_use_add_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
%icmp0 = icmp ugt i32 %a, %b
%t0 = select i1 %icmp0, i32 %a, i32 %b
@@ -84,7 +84,7 @@ define void @v_sad_u32_multi_use_add_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b
; GCN-LABEL: {{^}}v_sad_u32_multi_use_max_pat1:
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_multi_use_max_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
%icmp0 = icmp ugt i32 %a, %b
%t0 = select i1 %icmp0, i32 %a, i32 %b
store volatile i32 %t0, i32 *undef
@@ -101,7 +101,7 @@ define void @v_sad_u32_multi_use_max_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b
; GCN-LABEL: {{^}}v_sad_u32_multi_use_min_pat1:
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_multi_use_min_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
%icmp0 = icmp ugt i32 %a, %b
%t0 = select i1 %icmp0, i32 %a, i32 %b
@@ -119,7 +119,7 @@ define void @v_sad_u32_multi_use_min_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b
; GCN-LABEL: {{^}}v_sad_u32_multi_use_sub_pat2:
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
%icmp0 = icmp ugt i32 %a, %b
%sub0 = sub i32 %a, %b
store volatile i32 %sub0, i32 *undef
@@ -136,7 +136,7 @@ define void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b
; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GCN: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
%icmp0 = icmp ugt i32 %a, %b
%sub0 = sub i32 %a, %b
%sub1 = sub i32 %b, %a
@@ -154,7 +154,7 @@ define void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_vector_pat1(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+define amdgpu_kernel void @v_sad_u32_vector_pat1(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
%icmp0 = icmp ugt <4 x i32> %a, %b
%t0 = select <4 x i1> %icmp0, <4 x i32> %a, <4 x i32> %b
@@ -173,7 +173,7 @@ define void @v_sad_u32_vector_pat1(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_vector_pat2(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+define amdgpu_kernel void @v_sad_u32_vector_pat2(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
%icmp0 = icmp ugt <4 x i32> %a, %b
%sub0 = sub <4 x i32> %a, %b
%sub1 = sub <4 x i32> %b, %a
@@ -187,7 +187,7 @@ define void @v_sad_u32_vector_pat2(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <
; GCN-LABEL: {{^}}v_sad_u32_i16_pat1:
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
+define amdgpu_kernel void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
%icmp0 = icmp ugt i16 %a, %b
%t0 = select i1 %icmp0, i16 %a, i16 %b
@@ -204,7 +204,7 @@ define void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i16 %b, i16 %c)
; GCN-LABEL: {{^}}v_sad_u32_i16_pat2:
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b, i16 zeroext %c) {
+define amdgpu_kernel void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b, i16 zeroext %c) {
%icmp0 = icmp ugt i16 %a, %b
%sub0 = sub i16 %a, %b
%sub1 = sub i16 %b, %a
@@ -218,7 +218,7 @@ define void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zero
; GCN-LABEL: {{^}}v_sad_u32_i8_pat1:
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
+define amdgpu_kernel void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
%icmp0 = icmp ugt i8 %a, %b
%t0 = select i1 %icmp0, i8 %a, i8 %b
@@ -234,7 +234,7 @@ define void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
; GCN-LABEL: {{^}}v_sad_u32_i8_pat2:
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
+define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
%icmp0 = icmp ugt i8 %a, %b
%sub0 = sub i8 %a, %b
%sub1 = sub i8 %b, %a
@@ -251,7 +251,7 @@ define void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext
; GCN: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
%icmp0 = icmp ugt i32 %a, %b
%t0 = select i1 %icmp0, i32 %a, i32 %b
@@ -269,7 +269,7 @@ define void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a,
; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_mismatched_operands_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
%icmp0 = icmp ugt i32 %a, %b
%sub0 = sub i32 %a, %d
%sub1 = sub i32 %b, %a
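
Both pattern families in this file compute v_sad_u32(a, b, c) = |a - b| + c. pat1 forms the absolute difference as max(a, b) - min(a, b) from one unsigned compare; pat2 selects between a - b and b - a on that compare. The multi_use tests show the match surviving extra uses of some intermediates but not others, and the mismatched_operands tests show it failing when the operand pairs do not line up. A minimal pat1 instance, under an assumed name:

; Illustrative sketch, not part of the patch: the max/min form of |a - b| + c.
define amdgpu_kernel void @sad_sketch(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
  %gt = icmp ugt i32 %a, %b
  %max = select i1 %gt, i32 %a, i32 %b
  %min = select i1 %gt, i32 %b, i32 %a
  %absdiff = sub i32 %max, %min    ; |a - b|
  %sad = add i32 %absdiff, %c      ; + c, expected to match v_sad_u32
  store i32 %sad, i32 addrspace(1)* %out
  ret void
}
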
diff --git a/test/CodeGen/AMDGPU/saddo.ll b/test/CodeGen/AMDGPU/saddo.ll
index f8ced7942a60..586a455b2b91 100644
--- a/test/CodeGen/AMDGPU/saddo.ll
+++ b/test/CodeGen/AMDGPU/saddo.ll
@@ -6,7 +6,7 @@ declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
; FUNC-LABEL: {{^}}saddo_i64_zext:
-define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
%val = extractvalue { i64, i1 } %sadd, 0
%carry = extractvalue { i64, i1 } %sadd, 1
@@ -17,7 +17,7 @@ define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
}
; FUNC-LABEL: {{^}}s_saddo_i32:
-define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
%sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
%val = extractvalue { i32, i1 } %sadd, 0
%carry = extractvalue { i32, i1 } %sadd, 1
@@ -27,7 +27,7 @@ define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
}
; FUNC-LABEL: {{^}}v_saddo_i32:
-define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
%a = load i32, i32 addrspace(1)* %aptr, align 4
%b = load i32, i32 addrspace(1)* %bptr, align 4
%sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
@@ -39,7 +39,7 @@ define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
}
; FUNC-LABEL: {{^}}s_saddo_i64:
-define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
%sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
%val = extractvalue { i64, i1 } %sadd, 0
%carry = extractvalue { i64, i1 } %sadd, 1
@@ -51,7 +51,7 @@ define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64
; FUNC-LABEL: {{^}}v_saddo_i64:
; SI: v_add_i32
; SI: v_addc_u32
-define void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
%a = load i64, i64 addrspace(1)* %aptr, align 4
%b = load i64, i64 addrspace(1)* %bptr, align 4
%sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll
index 37083fbbd3c5..6e1dd1638333 100644
--- a/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -24,7 +24,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() #0
; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
entry:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = call i32 @llvm.amdgcn.workitem.id.y()
@@ -55,17 +55,17 @@ done: ; preds = %loop
; GCN-LABEL: {{^}}smrd_valu:
; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x2ee0
+; SI: s_mov_b32
; GCN: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
; SI: s_nop 3
; SI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, [[OFFSET]]
-; SI: s_mov_b32
; CI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xbb8
; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]]
; GCN-NOHSA: buffer_store_dword [[V_OUT]]
; GCN-HSA: flat_store_dword {{.*}}, [[V_OUT]]
-define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
entry:
%tmp = icmp ne i32 %a, 0
br i1 %tmp, label %if, label %else
@@ -93,7 +93,7 @@ endif: ; preds = %else, %if
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}}
; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 {
+define amdgpu_kernel void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 {
entry:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = add i32 %tmp, 4
@@ -113,7 +113,7 @@ entry:
; GCN-NOHSA: buffer_store_dword
; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
-define void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 {
entry:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = getelementptr i32, i32 addrspace(2)* %in, i32 %tmp
@@ -133,7 +133,7 @@ entry:
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: buffer_store_dwordx2
; GCN-HSA: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 {
entry:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = getelementptr i64, i64 addrspace(2)* %in, i32 %tmp
@@ -155,7 +155,7 @@ entry:
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: buffer_store_dwordx4
; GCN-HSA: flat_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 {
entry:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %in, i32 %tmp
@@ -189,7 +189,7 @@ entry:
; GCN-NOHSA: buffer_store_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
-define void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 {
entry:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %in, i32 %tmp
@@ -230,7 +230,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
; GCN: s_endpgm
-define void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 {
entry:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = getelementptr <16 x i32>, <16 x i32> addrspace(2)* %in, i32 %tmp
@@ -247,7 +247,7 @@ entry:
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[MOVED]]
; GCN-NOHSA: buffer_store_dword [[ADD]]
; GCN-HSA: flat_store_dword {{.*}}, [[ADD]]
-define void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 {
+define amdgpu_kernel void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 {
entry:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = add i32 %tmp, 4
@@ -261,7 +261,7 @@ entry:
; GCN-LABEL: {{^}}smrd_valu2_max_smrd_offset:
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}}
; GCN-HSA: flat_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}]
-define void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
+define amdgpu_kernel void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
entry:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = add i32 %tmp, 4
@@ -275,7 +275,7 @@ entry:
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}}
; GCN-HSA: flat_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}]
-define void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
+define amdgpu_kernel void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
entry:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = add i32 %tmp, 4
@@ -290,7 +290,7 @@ entry:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
-define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
+define amdgpu_kernel void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
%tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
@@ -313,7 +313,7 @@ entry:
; GCN-NOHSA: buffer_store_dword
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
-define void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
+define amdgpu_kernel void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
%tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
@@ -350,7 +350,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
-define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
+define amdgpu_kernel void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
%tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
@@ -385,7 +385,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
-define void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
+define amdgpu_kernel void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
%tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
@@ -439,9 +439,9 @@ entry:
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN-NOHSA: buffer_store_dword [[ONE]]
; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]]
-; GCN; {{^}}[[EXIT]]:
+; GCN: {{^}}[[EXIT]]:
; GCN: s_endpgm
-define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
bb3: ; preds = %bb2
%tmp0 = bitcast i32 %cond to float
%tmp1 = fadd float %tmp0, 2.500000e-01
@@ -459,7 +459,7 @@ bb7: ; preds = %bb3
; GCN-LABEL: {{^}}phi_visit_order:
; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 1, v{{[0-9]+}}
-define void @phi_visit_order() {
+define amdgpu_kernel void @phi_visit_order() {
bb:
br label %bb1
@@ -484,7 +484,7 @@ bb4:
; GCN: [[LOOP_LABEL:[0-9a-zA-Z_]+]]:
; GCN: s_xor_b32 [[B]], [[B]], [[A]]
; GCN: s_cbranch_scc{{[01]}} [[LOOP_LABEL]]
-define void @phi_imm_in_sgprs(i32 addrspace(3)* %out, i32 %cond) {
+define amdgpu_kernel void @phi_imm_in_sgprs(i32 addrspace(3)* %out, i32 %cond) {
entry:
br label %loop
diff --git a/test/CodeGen/AMDGPU/sampler-resource-id.ll b/test/CodeGen/AMDGPU/sampler-resource-id.ll
index c41d345369bf..4ea503bf6098 100644
--- a/test/CodeGen/AMDGPU/sampler-resource-id.ll
+++ b/test/CodeGen/AMDGPU/sampler-resource-id.ll
@@ -5,7 +5,7 @@
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 0(
-define void @test_0(i32 %in0, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_0(i32 %in0, i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in0) #0
store i32 %0, i32 addrspace(1)* %out
@@ -17,7 +17,7 @@ entry:
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 1(
-define void @test_1(i32 %in0, i32 %in1, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_1(i32 %in0, i32 %in1, i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in1) #0
store i32 %0, i32 addrspace(1)* %out
@@ -29,7 +29,7 @@ entry:
; EG: MOV [[VAL]], literal.x
; EG-NEXT: LSHR
; EG-NEXT: 2(
-define void @test_2(i32 %in0, i32 %in1, i32 %in2, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_2(i32 %in0, i32 %in1, i32 %in2, i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in2) #0
store i32 %0, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir b/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir
index af71086e542f..5bee36d878eb 100644
--- a/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir
+++ b/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir
@@ -1,23 +1,23 @@
# RUN: llc -march=amdgcn -run-pass si-insert-waits %s -o - | FileCheck %s
--- |
- define void @basic_insert_dcache_wb() {
+ define amdgpu_kernel void @basic_insert_dcache_wb() {
ret void
}
- define void @explicit_flush_after() {
+ define amdgpu_kernel void @explicit_flush_after() {
ret void
}
- define void @explicit_flush_before() {
+ define amdgpu_kernel void @explicit_flush_before() {
ret void
}
- define void @no_scalar_store() {
+ define amdgpu_kernel void @no_scalar_store() {
ret void
}
- define void @multi_block_store() {
+ define amdgpu_kernel void @multi_block_store() {
bb0:
br i1 undef, label %bb1, label %bb2
@@ -28,7 +28,7 @@
ret void
}
- define void @one_block_store() {
+ define amdgpu_kernel void @one_block_store() {
bb0:
br i1 undef, label %bb1, label %bb2
@@ -169,5 +169,5 @@ tracksRegLiveness: false
body: |
bb.0:
S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
- SI_RETURN undef %vgpr0
+ SI_RETURN_TO_EPILOG undef %vgpr0
...
diff --git a/test/CodeGen/AMDGPU/scalar_to_vector.ll b/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 32df16778a91..62d0d9367885 100644
--- a/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -1,15 +1,15 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; XXX - Why the packing?
-; FUNC-LABEL: {{^}}scalar_to_vector_v2i32:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
-; SI: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, [[VAL]]
-; SI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[SHR]]
-; SI: v_or_b32_e32 v[[OR:[0-9]+]], [[SHL]], [[SHR]]
-; SI: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[OR]]
-; SI: buffer_store_dwordx2 v{{\[}}[[OR]]:[[COPY]]{{\]}}
-define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+; GCN-LABEL: {{^}}scalar_to_vector_v2i32:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, [[VAL]]
+; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[SHR]]
+; GCN: v_or_b32_e32 v[[OR:[0-9]+]], [[SHL]], [[SHR]]
+; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[OR]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[OR]]:[[COPY]]{{\]}}
+define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
%tmp1 = load i32, i32 addrspace(1)* %in, align 4
%bc = bitcast i32 %tmp1 to <2 x i16>
%tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -17,11 +17,11 @@ define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(
ret void
}
-; FUNC-LABEL: {{^}}scalar_to_vector_v2f32:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
-; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
-; SI: buffer_store_dwordx2
-define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+; GCN-LABEL: {{^}}scalar_to_vector_v2f32:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
+; GCN: buffer_store_dwordx2
+define amdgpu_kernel void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
%tmp1 = load float, float addrspace(1)* %in, align 4
%bc = bitcast float %tmp1 to <2 x i16>
%tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -33,7 +33,7 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac
; to produce one, but for some reason never made it to selection.
-; define void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+; define amdgpu_kernel void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
; %tmp1 = load i32, i32 addrspace(1)* %in, align 4
; %bc = bitcast i32 %tmp1 to <4 x i8>
@@ -42,7 +42,7 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac
; ret void
; }
-; define void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind {
+; define amdgpu_kernel void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind {
; %newvec0 = insertelement <2 x i64> undef, i64 12345, i32 0
; %newvec1 = insertelement <2 x i64> %newvec0, i64 undef, i32 1
; %bc = bitcast <2 x i64> %newvec1 to <4 x i32>
@@ -51,7 +51,7 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac
; ret void
; }
-; define void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind {
+; define amdgpu_kernel void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind {
; %newvec0 = insertelement <4 x i32> undef, i32 12345, i32 0
; %bc = bitcast <4 x i32> %newvec0 to <8 x i16>
; %add = add <8 x i16> %bc, <i16 1, i16 2, i16 3, i16 4, i16 1, i16 2, i16 3, i16 4>
@@ -59,7 +59,7 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac
; ret void
; }
-; define void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind {
+; define amdgpu_kernel void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind {
; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0
; %bc = bitcast <2 x i32> %newvec0 to <4 x i16>
; %add = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
@@ -67,10 +67,9 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac
; ret void
; }
-; define void @scalar_to_vector_test6(<4 x i16> addrspace(1)* %out) nounwind {
-; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0
-; %bc = bitcast <2 x i32> %newvec0 to <4 x i16>
-; %add = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
-; store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16
-; ret void
-; }
+define amdgpu_kernel void @scalar_to_vector_test6(<2 x half> addrspace(1)* %out, i8 zeroext %val) nounwind {
+ %newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0
+ %bc = bitcast <4 x i8> %newvec0 to <2 x half>
+ store <2 x half> %bc, <2 x half> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll b/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll
index e040639a2d94..60abd83546d3 100644
--- a/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll
+++ b/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll
@@ -1,81 +1,85 @@
-;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
-;REQUIRES: asserts
+; RUN: llc -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs < %s
+; REQUIRES: asserts
-define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
main_body:
- %0 = extractelement <4 x float> %reg1, i32 0
- %1 = extractelement <4 x float> %reg1, i32 1
- %2 = extractelement <4 x float> %reg1, i32 2
- %3 = extractelement <4 x float> %reg1, i32 3
- %4 = fcmp ult float %1, 0.000000e+00
- %5 = select i1 %4, float 1.000000e+00, float 0.000000e+00
- %6 = fsub float -0.000000e+00, %5
- %7 = fptosi float %6 to i32
- %8 = bitcast i32 %7 to float
- %9 = fcmp ult float %0, 5.700000e+01
- %10 = select i1 %9, float 1.000000e+00, float 0.000000e+00
- %11 = fsub float -0.000000e+00, %10
- %12 = fptosi float %11 to i32
- %13 = bitcast i32 %12 to float
- %14 = bitcast float %8 to i32
- %15 = bitcast float %13 to i32
- %16 = and i32 %14, %15
- %17 = bitcast i32 %16 to float
- %18 = bitcast float %17 to i32
- %19 = icmp ne i32 %18, 0
- %20 = fcmp ult float %0, 0.000000e+00
- %21 = select i1 %20, float 1.000000e+00, float 0.000000e+00
- %22 = fsub float -0.000000e+00, %21
- %23 = fptosi float %22 to i32
- %24 = bitcast i32 %23 to float
- %25 = bitcast float %24 to i32
- %26 = icmp ne i32 %25, 0
- br i1 %19, label %IF, label %ELSE
+ %tmp = extractelement <4 x float> %reg1, i32 0
+ %tmp5 = extractelement <4 x float> %reg1, i32 1
+ %tmp6 = extractelement <4 x float> %reg1, i32 2
+ %tmp7 = extractelement <4 x float> %reg1, i32 3
+ %tmp8 = fcmp ult float %tmp5, 0.000000e+00
+ %tmp9 = select i1 %tmp8, float 1.000000e+00, float 0.000000e+00
+ %tmp10 = fsub float -0.000000e+00, %tmp9
+ %tmp11 = fptosi float %tmp10 to i32
+ %tmp12 = bitcast i32 %tmp11 to float
+ %tmp13 = fcmp ult float %tmp, 5.700000e+01
+ %tmp14 = select i1 %tmp13, float 1.000000e+00, float 0.000000e+00
+ %tmp15 = fsub float -0.000000e+00, %tmp14
+ %tmp16 = fptosi float %tmp15 to i32
+ %tmp17 = bitcast i32 %tmp16 to float
+ %tmp18 = bitcast float %tmp12 to i32
+ %tmp19 = bitcast float %tmp17 to i32
+ %tmp20 = and i32 %tmp18, %tmp19
+ %tmp21 = bitcast i32 %tmp20 to float
+ %tmp22 = bitcast float %tmp21 to i32
+ %tmp23 = icmp ne i32 %tmp22, 0
+ %tmp24 = fcmp ult float %tmp, 0.000000e+00
+ %tmp25 = select i1 %tmp24, float 1.000000e+00, float 0.000000e+00
+ %tmp26 = fsub float -0.000000e+00, %tmp25
+ %tmp27 = fptosi float %tmp26 to i32
+ %tmp28 = bitcast i32 %tmp27 to float
+ %tmp29 = bitcast float %tmp28 to i32
+ %tmp30 = icmp ne i32 %tmp29, 0
+ br i1 %tmp23, label %IF, label %ELSE
IF: ; preds = %main_body
- %. = select i1 %26, float 0.000000e+00, float 1.000000e+00
- %.18 = select i1 %26, float 1.000000e+00, float 0.000000e+00
+ %. = select i1 %tmp30, float 0.000000e+00, float 1.000000e+00
+ %.18 = select i1 %tmp30, float 1.000000e+00, float 0.000000e+00
br label %ENDIF
ELSE: ; preds = %main_body
- br i1 %26, label %ENDIF, label %ELSE17
+ br i1 %tmp30, label %ENDIF, label %ELSE17
ENDIF: ; preds = %ELSE17, %ELSE, %IF
- %temp1.0 = phi float [ %., %IF ], [ %48, %ELSE17 ], [ 0.000000e+00, %ELSE ]
- %temp2.0 = phi float [ 0.000000e+00, %IF ], [ %49, %ELSE17 ], [ 1.000000e+00, %ELSE ]
- %temp.0 = phi float [ %.18, %IF ], [ %47, %ELSE17 ], [ 0.000000e+00, %ELSE ]
- %27 = call float @llvm.AMDGPU.clamp.f32(float %temp.0, float 0.000000e+00, float 1.000000e+00)
- %28 = call float @llvm.AMDGPU.clamp.f32(float %temp1.0, float 0.000000e+00, float 1.000000e+00)
- %29 = call float @llvm.AMDGPU.clamp.f32(float %temp2.0, float 0.000000e+00, float 1.000000e+00)
- %30 = call float @llvm.AMDGPU.clamp.f32(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
- %31 = insertelement <4 x float> undef, float %27, i32 0
- %32 = insertelement <4 x float> %31, float %28, i32 1
- %33 = insertelement <4 x float> %32, float %29, i32 2
- %34 = insertelement <4 x float> %33, float %30, i32 3
- call void @llvm.r600.store.swizzle(<4 x float> %34, i32 0, i32 0)
+ %temp1.0 = phi float [ %., %IF ], [ %tmp48, %ELSE17 ], [ 0.000000e+00, %ELSE ]
+ %temp2.0 = phi float [ 0.000000e+00, %IF ], [ %tmp49, %ELSE17 ], [ 1.000000e+00, %ELSE ]
+ %temp.0 = phi float [ %.18, %IF ], [ %tmp47, %ELSE17 ], [ 0.000000e+00, %ELSE ]
+ %max.0.i = call float @llvm.maxnum.f32(float %temp.0, float 0.000000e+00)
+ %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00)
+ %max.0.i3 = call float @llvm.maxnum.f32(float %temp1.0, float 0.000000e+00)
+ %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00)
+ %max.0.i1 = call float @llvm.maxnum.f32(float %temp2.0, float 0.000000e+00)
+ %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00)
+ %tmp31 = insertelement <4 x float> undef, float %clamp.i, i32 0
+ %tmp32 = insertelement <4 x float> %tmp31, float %clamp.i4, i32 1
+ %tmp33 = insertelement <4 x float> %tmp32, float %clamp.i2, i32 2
+ %tmp34 = insertelement <4 x float> %tmp33, float 1.000000e+00, i32 3
+ call void @llvm.r600.store.swizzle(<4 x float> %tmp34, i32 0, i32 0)
ret void
ELSE17: ; preds = %ELSE
- %35 = fadd float 0.000000e+00, 0x3FC99999A0000000
- %36 = fadd float 0.000000e+00, 0x3FC99999A0000000
- %37 = fadd float 0.000000e+00, 0x3FC99999A0000000
- %38 = fadd float %35, 0x3FC99999A0000000
- %39 = fadd float %36, 0x3FC99999A0000000
- %40 = fadd float %37, 0x3FC99999A0000000
- %41 = fadd float %38, 0x3FC99999A0000000
- %42 = fadd float %39, 0x3FC99999A0000000
- %43 = fadd float %40, 0x3FC99999A0000000
- %44 = fadd float %41, 0x3FC99999A0000000
- %45 = fadd float %42, 0x3FC99999A0000000
- %46 = fadd float %43, 0x3FC99999A0000000
- %47 = fadd float %44, 0x3FC99999A0000000
- %48 = fadd float %45, 0x3FC99999A0000000
- %49 = fadd float %46, 0x3FC99999A0000000
+ %tmp35 = fadd float 0.000000e+00, 0x3FC99999A0000000
+ %tmp36 = fadd float 0.000000e+00, 0x3FC99999A0000000
+ %tmp37 = fadd float 0.000000e+00, 0x3FC99999A0000000
+ %tmp38 = fadd float %tmp35, 0x3FC99999A0000000
+ %tmp39 = fadd float %tmp36, 0x3FC99999A0000000
+ %tmp40 = fadd float %tmp37, 0x3FC99999A0000000
+ %tmp41 = fadd float %tmp38, 0x3FC99999A0000000
+ %tmp42 = fadd float %tmp39, 0x3FC99999A0000000
+ %tmp43 = fadd float %tmp40, 0x3FC99999A0000000
+ %tmp44 = fadd float %tmp41, 0x3FC99999A0000000
+ %tmp45 = fadd float %tmp42, 0x3FC99999A0000000
+ %tmp46 = fadd float %tmp43, 0x3FC99999A0000000
+ %tmp47 = fadd float %tmp44, 0x3FC99999A0000000
+ %tmp48 = fadd float %tmp45, 0x3FC99999A0000000
+ %tmp49 = fadd float %tmp46, 0x3FC99999A0000000
br label %ENDIF
}
-declare float @llvm.AMDGPU.clamp.f32(float, float, float) #0
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-attributes #0 = { readnone }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll b/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
index f907e154f962..177957c0b35b 100644
--- a/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
+++ b/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
@@ -1,88 +1,91 @@
-;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
-;REQUIRES: asserts
+; RUN: llc -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs < %s
+; REQUIRES: asserts
-define void @main() {
+define amdgpu_kernel void @main() #0 {
main_body:
- %0 = load <4 x float>, <4 x float> addrspace(9)* null
- %1 = extractelement <4 x float> %0, i32 3
- %2 = fptosi float %1 to i32
- %3 = bitcast i32 %2 to float
- %4 = bitcast float %3 to i32
- %5 = sdiv i32 %4, 4
- %6 = bitcast i32 %5 to float
- %7 = bitcast float %6 to i32
- %8 = mul i32 %7, 4
- %9 = bitcast i32 %8 to float
- %10 = bitcast float %9 to i32
- %11 = sub i32 0, %10
- %12 = bitcast i32 %11 to float
- %13 = bitcast float %3 to i32
- %14 = bitcast float %12 to i32
- %15 = add i32 %13, %14
- %16 = bitcast i32 %15 to float
- %17 = load <4 x float>, <4 x float> addrspace(9)* null
- %18 = extractelement <4 x float> %17, i32 0
- %19 = load <4 x float>, <4 x float> addrspace(9)* null
- %20 = extractelement <4 x float> %19, i32 1
- %21 = load <4 x float>, <4 x float> addrspace(9)* null
- %22 = extractelement <4 x float> %21, i32 2
+ %tmp = load <4 x float>, <4 x float> addrspace(9)* null
+ %tmp5 = extractelement <4 x float> %tmp, i32 3
+ %tmp6 = fptosi float %tmp5 to i32
+ %tmp7 = bitcast i32 %tmp6 to float
+ %tmp8 = bitcast float %tmp7 to i32
+ %tmp9 = sdiv i32 %tmp8, 4
+ %tmp10 = bitcast i32 %tmp9 to float
+ %tmp11 = bitcast float %tmp10 to i32
+ %tmp12 = mul i32 %tmp11, 4
+ %tmp13 = bitcast i32 %tmp12 to float
+ %tmp14 = bitcast float %tmp13 to i32
+ %tmp15 = sub i32 0, %tmp14
+ %tmp16 = bitcast i32 %tmp15 to float
+ %tmp17 = bitcast float %tmp7 to i32
+ %tmp18 = bitcast float %tmp16 to i32
+ %tmp19 = add i32 %tmp17, %tmp18
+ %tmp20 = bitcast i32 %tmp19 to float
+ %tmp21 = load <4 x float>, <4 x float> addrspace(9)* null
+ %tmp22 = extractelement <4 x float> %tmp21, i32 0
+ %tmp23 = load <4 x float>, <4 x float> addrspace(9)* null
+ %tmp24 = extractelement <4 x float> %tmp23, i32 1
+ %tmp25 = load <4 x float>, <4 x float> addrspace(9)* null
+ %tmp26 = extractelement <4 x float> %tmp25, i32 2
br label %LOOP
LOOP: ; preds = %IF31, %main_body
- %temp12.0 = phi float [ 0.000000e+00, %main_body ], [ %47, %IF31 ]
- %temp6.0 = phi float [ %22, %main_body ], [ %temp6.1, %IF31 ]
- %temp5.0 = phi float [ %20, %main_body ], [ %temp5.1, %IF31 ]
- %temp4.0 = phi float [ %18, %main_body ], [ %temp4.1, %IF31 ]
- %23 = bitcast float %temp12.0 to i32
- %24 = bitcast float %6 to i32
- %25 = icmp sge i32 %23, %24
- %26 = sext i1 %25 to i32
- %27 = bitcast i32 %26 to float
- %28 = bitcast float %27 to i32
- %29 = icmp ne i32 %28, 0
- br i1 %29, label %IF, label %LOOP29
+ %temp12.0 = phi float [ 0.000000e+00, %main_body ], [ %tmp47, %IF31 ]
+ %temp6.0 = phi float [ %tmp26, %main_body ], [ %temp6.1, %IF31 ]
+ %temp5.0 = phi float [ %tmp24, %main_body ], [ %temp5.1, %IF31 ]
+ %temp4.0 = phi float [ %tmp22, %main_body ], [ %temp4.1, %IF31 ]
+ %tmp27 = bitcast float %temp12.0 to i32
+ %tmp28 = bitcast float %tmp10 to i32
+ %tmp29 = icmp sge i32 %tmp27, %tmp28
+ %tmp30 = sext i1 %tmp29 to i32
+ %tmp31 = bitcast i32 %tmp30 to float
+ %tmp32 = bitcast float %tmp31 to i32
+ %tmp33 = icmp ne i32 %tmp32, 0
+ br i1 %tmp33, label %IF, label %LOOP29
IF: ; preds = %LOOP
- %30 = call float @llvm.AMDGPU.clamp.f32(float %temp4.0, float 0.000000e+00, float 1.000000e+00)
- %31 = call float @llvm.AMDGPU.clamp.f32(float %temp5.0, float 0.000000e+00, float 1.000000e+00)
- %32 = call float @llvm.AMDGPU.clamp.f32(float %temp6.0, float 0.000000e+00, float 1.000000e+00)
- %33 = call float @llvm.AMDGPU.clamp.f32(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
- %34 = insertelement <4 x float> undef, float %30, i32 0
- %35 = insertelement <4 x float> %34, float %31, i32 1
- %36 = insertelement <4 x float> %35, float %32, i32 2
- %37 = insertelement <4 x float> %36, float %33, i32 3
- call void @llvm.r600.store.swizzle(<4 x float> %37, i32 0, i32 0)
+ %max.0.i = call float @llvm.maxnum.f32(float %temp4.0, float 0.000000e+00)
+ %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00)
+ %max.0.i3 = call float @llvm.maxnum.f32(float %temp5.0, float 0.000000e+00)
+ %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00)
+ %max.0.i1 = call float @llvm.maxnum.f32(float %temp6.0, float 0.000000e+00)
+ %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00)
+ %tmp34 = insertelement <4 x float> undef, float %clamp.i, i32 0
+ %tmp35 = insertelement <4 x float> %tmp34, float %clamp.i4, i32 1
+ %tmp36 = insertelement <4 x float> %tmp35, float %clamp.i2, i32 2
+ %tmp37 = insertelement <4 x float> %tmp36, float 1.000000e+00, i32 3
+ call void @llvm.r600.store.swizzle(<4 x float> %tmp37, i32 0, i32 0)
ret void
-LOOP29: ; preds = %LOOP, %ENDIF30
+LOOP29: ; preds = %ENDIF30, %LOOP
%temp6.1 = phi float [ %temp4.1, %ENDIF30 ], [ %temp6.0, %LOOP ]
%temp5.1 = phi float [ %temp6.1, %ENDIF30 ], [ %temp5.0, %LOOP ]
%temp4.1 = phi float [ %temp5.1, %ENDIF30 ], [ %temp4.0, %LOOP ]
- %temp20.0 = phi float [ %50, %ENDIF30 ], [ 0.000000e+00, %LOOP ]
- %38 = bitcast float %temp20.0 to i32
- %39 = bitcast float %16 to i32
- %40 = icmp sge i32 %38, %39
- %41 = sext i1 %40 to i32
- %42 = bitcast i32 %41 to float
- %43 = bitcast float %42 to i32
- %44 = icmp ne i32 %43, 0
- br i1 %44, label %IF31, label %ENDIF30
+ %temp20.0 = phi float [ %tmp50, %ENDIF30 ], [ 0.000000e+00, %LOOP ]
+ %tmp38 = bitcast float %temp20.0 to i32
+ %tmp39 = bitcast float %tmp20 to i32
+ %tmp40 = icmp sge i32 %tmp38, %tmp39
+ %tmp41 = sext i1 %tmp40 to i32
+ %tmp42 = bitcast i32 %tmp41 to float
+ %tmp43 = bitcast float %tmp42 to i32
+ %tmp44 = icmp ne i32 %tmp43, 0
+ br i1 %tmp44, label %IF31, label %ENDIF30
IF31: ; preds = %LOOP29
- %45 = bitcast float %temp12.0 to i32
- %46 = add i32 %45, 1
- %47 = bitcast i32 %46 to float
+ %tmp45 = bitcast float %temp12.0 to i32
+ %tmp46 = add i32 %tmp45, 1
+ %tmp47 = bitcast i32 %tmp46 to float
br label %LOOP
ENDIF30: ; preds = %LOOP29
- %48 = bitcast float %temp20.0 to i32
- %49 = add i32 %48, 1
- %50 = bitcast i32 %49 to float
+ %tmp48 = bitcast float %temp20.0 to i32
+ %tmp49 = add i32 %tmp48, 1
+ %tmp50 = bitcast i32 %tmp49 to float
br label %LOOP29
}
-declare float @llvm.AMDGPU.clamp.f32(float, float, float) #0
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) #0
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
-declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { readnone }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop.ll b/test/CodeGen/AMDGPU/schedule-fs-loop.ll
index 5839785f00d5..6cd419f6cfc4 100644
--- a/test/CodeGen/AMDGPU/schedule-fs-loop.ll
+++ b/test/CodeGen/AMDGPU/schedule-fs-loop.ll
@@ -1,55 +1,84 @@
-;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
-;REQUIRES: asserts
+; RUN: llc -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs < %s
+; REQUIRES: asserts
-define void @main() {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
main_body:
- %0 = load <4 x float>, <4 x float> addrspace(9)* null
- %1 = extractelement <4 x float> %0, i32 3
- %2 = fptosi float %1 to i32
- %3 = bitcast i32 %2 to float
- %4 = load <4 x float>, <4 x float> addrspace(9)* null
- %5 = extractelement <4 x float> %4, i32 0
- %6 = load <4 x float>, <4 x float> addrspace(9)* null
- %7 = extractelement <4 x float> %6, i32 1
- %8 = load <4 x float>, <4 x float> addrspace(9)* null
- %9 = extractelement <4 x float> %8, i32 2
- br label %LOOP
+ %tmp = extractelement <4 x float> %reg1, i32 0
+ %tmp5 = extractelement <4 x float> %reg1, i32 1
+ %tmp6 = extractelement <4 x float> %reg1, i32 2
+ %tmp7 = extractelement <4 x float> %reg1, i32 3
+ %tmp8 = fcmp ult float %tmp5, 0.000000e+00
+ %tmp9 = select i1 %tmp8, float 1.000000e+00, float 0.000000e+00
+ %tmp10 = fsub float -0.000000e+00, %tmp9
+ %tmp11 = fptosi float %tmp10 to i32
+ %tmp12 = bitcast i32 %tmp11 to float
+ %tmp13 = fcmp ult float %tmp, 5.700000e+01
+ %tmp14 = select i1 %tmp13, float 1.000000e+00, float 0.000000e+00
+ %tmp15 = fsub float -0.000000e+00, %tmp14
+ %tmp16 = fptosi float %tmp15 to i32
+ %tmp17 = bitcast i32 %tmp16 to float
+ %tmp18 = bitcast float %tmp12 to i32
+ %tmp19 = bitcast float %tmp17 to i32
+ %tmp20 = and i32 %tmp18, %tmp19
+ %tmp21 = bitcast i32 %tmp20 to float
+ %tmp22 = bitcast float %tmp21 to i32
+ %tmp23 = icmp ne i32 %tmp22, 0
+ %tmp24 = fcmp ult float %tmp, 0.000000e+00
+ %tmp25 = select i1 %tmp24, float 1.000000e+00, float 0.000000e+00
+ %tmp26 = fsub float -0.000000e+00, %tmp25
+ %tmp27 = fptosi float %tmp26 to i32
+ %tmp28 = bitcast i32 %tmp27 to float
+ %tmp29 = bitcast float %tmp28 to i32
+ %tmp30 = icmp ne i32 %tmp29, 0
+ br i1 %tmp23, label %IF, label %ELSE
-LOOP: ; preds = %ENDIF, %main_body
- %temp4.0 = phi float [ %5, %main_body ], [ %temp5.0, %ENDIF ]
- %temp5.0 = phi float [ %7, %main_body ], [ %temp6.0, %ENDIF ]
- %temp6.0 = phi float [ %9, %main_body ], [ %temp4.0, %ENDIF ]
- %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %27, %ENDIF ]
- %10 = bitcast float %temp8.0 to i32
- %11 = bitcast float %3 to i32
- %12 = icmp sge i32 %10, %11
- %13 = sext i1 %12 to i32
- %14 = bitcast i32 %13 to float
- %15 = bitcast float %14 to i32
- %16 = icmp ne i32 %15, 0
- br i1 %16, label %IF, label %ENDIF
+IF: ; preds = %main_body
+ %. = select i1 %tmp30, float 0.000000e+00, float 1.000000e+00
+ %.18 = select i1 %tmp30, float 1.000000e+00, float 0.000000e+00
+ br label %ENDIF
-IF: ; preds = %LOOP
- %17 = call float @llvm.AMDGPU.clamp.f32(float %temp4.0, float 0.000000e+00, float 1.000000e+00)
- %18 = call float @llvm.AMDGPU.clamp.f32(float %temp5.0, float 0.000000e+00, float 1.000000e+00)
- %19 = call float @llvm.AMDGPU.clamp.f32(float %temp6.0, float 0.000000e+00, float 1.000000e+00)
- %20 = call float @llvm.AMDGPU.clamp.f32(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
- %21 = insertelement <4 x float> undef, float %17, i32 0
- %22 = insertelement <4 x float> %21, float %18, i32 1
- %23 = insertelement <4 x float> %22, float %19, i32 2
- %24 = insertelement <4 x float> %23, float %20, i32 3
- call void @llvm.r600.store.swizzle(<4 x float> %24, i32 0, i32 0)
+ELSE: ; preds = %main_body
+ br i1 %tmp30, label %ENDIF, label %ELSE17
+
+ENDIF: ; preds = %ELSE17, %ELSE, %IF
+ %temp1.0 = phi float [ %., %IF ], [ %tmp48, %ELSE17 ], [ 0.000000e+00, %ELSE ]
+ %temp2.0 = phi float [ 0.000000e+00, %IF ], [ %tmp49, %ELSE17 ], [ 1.000000e+00, %ELSE ]
+ %temp.0 = phi float [ %.18, %IF ], [ %tmp47, %ELSE17 ], [ 0.000000e+00, %ELSE ]
+ %max.0.i = call float @llvm.maxnum.f32(float %temp.0, float 0.000000e+00)
+ %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00)
+ %max.0.i3 = call float @llvm.maxnum.f32(float %temp1.0, float 0.000000e+00)
+ %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00)
+ %max.0.i1 = call float @llvm.maxnum.f32(float %temp2.0, float 0.000000e+00)
+ %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00)
+ %tmp31 = insertelement <4 x float> undef, float %clamp.i, i32 0
+ %tmp32 = insertelement <4 x float> %tmp31, float %clamp.i4, i32 1
+ %tmp33 = insertelement <4 x float> %tmp32, float %clamp.i2, i32 2
+ %tmp34 = insertelement <4 x float> %tmp33, float 1.000000e+00, i32 3
+ call void @llvm.r600.store.swizzle(<4 x float> %tmp34, i32 0, i32 0)
ret void
-ENDIF: ; preds = %LOOP
- %25 = bitcast float %temp8.0 to i32
- %26 = add i32 %25, 1
- %27 = bitcast i32 %26 to float
- br label %LOOP
+ELSE17: ; preds = %ELSE
+ %tmp35 = fadd float 0.000000e+00, 0x3FC99999A0000000
+ %tmp36 = fadd float 0.000000e+00, 0x3FC99999A0000000
+ %tmp37 = fadd float 0.000000e+00, 0x3FC99999A0000000
+ %tmp38 = fadd float %tmp35, 0x3FC99999A0000000
+ %tmp39 = fadd float %tmp36, 0x3FC99999A0000000
+ %tmp40 = fadd float %tmp37, 0x3FC99999A0000000
+ %tmp41 = fadd float %tmp38, 0x3FC99999A0000000
+ %tmp42 = fadd float %tmp39, 0x3FC99999A0000000
+ %tmp43 = fadd float %tmp40, 0x3FC99999A0000000
+ %tmp44 = fadd float %tmp41, 0x3FC99999A0000000
+ %tmp45 = fadd float %tmp42, 0x3FC99999A0000000
+ %tmp46 = fadd float %tmp43, 0x3FC99999A0000000
+ %tmp47 = fadd float %tmp44, 0x3FC99999A0000000
+ %tmp48 = fadd float %tmp45, 0x3FC99999A0000000
+ %tmp49 = fadd float %tmp46, 0x3FC99999A0000000
+ br label %ENDIF
}
-declare float @llvm.AMDGPU.clamp.f32(float, float, float) #0
-
-declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) #0
-attributes #0 = { readnone }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/schedule-global-loads.ll b/test/CodeGen/AMDGPU/schedule-global-loads.ll
index 32c456bd2ceb..44d46086f02a 100644
--- a/test/CodeGen/AMDGPU/schedule-global-loads.ll
+++ b/test/CodeGen/AMDGPU/schedule-global-loads.ll
@@ -10,7 +10,7 @@
; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; SI: buffer_store_dword [[REG0]]
; SI: buffer_store_dword [[REG1]]
-define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 {
%load0 = load i32, i32 addrspace(1)* %ptr, align 4
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 2
%load1 = load i32, i32 addrspace(1)* %gep, align 4
@@ -24,7 +24,7 @@ define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)*
; FUNC-LABEL: {{^}}same_base_ptr_crash:
; SI: buffer_load_dword
; SI: buffer_load_dword
-define void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
+define amdgpu_kernel void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
entry:
%out1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset
%tmp0 = load i32, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/schedule-if-2.ll b/test/CodeGen/AMDGPU/schedule-if-2.ll
index aa67b2e0f7db..964298a55318 100644
--- a/test/CodeGen/AMDGPU/schedule-if-2.ll
+++ b/test/CodeGen/AMDGPU/schedule-if-2.ll
@@ -1,7 +1,7 @@
;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
;REQUIRES: asserts
-define void @main() {
+define amdgpu_kernel void @main() {
main_body:
%0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
%1 = extractelement <4 x float> %0, i32 0
diff --git a/test/CodeGen/AMDGPU/schedule-if.ll b/test/CodeGen/AMDGPU/schedule-if.ll
index 6637b3897717..feac5d918f63 100644
--- a/test/CodeGen/AMDGPU/schedule-if.ll
+++ b/test/CodeGen/AMDGPU/schedule-if.ll
@@ -1,7 +1,7 @@
;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
;REQUIRES: asserts
-define void @main() {
+define amdgpu_kernel void @main() {
main_body:
%0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
%1 = extractelement <4 x float> %0, i32 0
diff --git a/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll b/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
index ccfde7b9adc5..5c47c163dcce 100644
--- a/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
+++ b/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
@@ -12,7 +12,7 @@
; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
-define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
+define amdgpu_kernel void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
store i32 %x, i32 addrspace(1)* %out0, align 4
store i32 %y, i32 addrspace(1)* %out1, align 4
ret void
@@ -26,7 +26,7 @@ define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1,
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN: s_endpgm
-define void @same_base_ptr_crash(i64 addrspace(1)* %out,
+define amdgpu_kernel void @same_base_ptr_crash(i64 addrspace(1)* %out,
i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7,
i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15,
i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23,
diff --git a/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll b/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
new file mode 100644
index 000000000000..4520fe86136f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
@@ -0,0 +1,591 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck %s
+
+; We expect a two digit VGPR usage here, not a three digit.
+; CHECK: NumVgprs: {{[0-9][0-9]$}}
+
+define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %arg, float addrspace(1)* nocapture %arg1) {
+bb:
+ %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 1
+ %tmp2 = load float, float addrspace(3)* %tmp, align 4
+ %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2
+ %tmp4 = load float, float addrspace(3)* %tmp3, align 4
+ %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 3
+ %tmp6 = load float, float addrspace(3)* %tmp5, align 4
+ %tmp7 = tail call float @llvm.fmuladd.f32(float %tmp2, float %tmp4, float %tmp6)
+ %tmp8 = getelementptr inbounds float, float addrspace(3)* %arg, i32 5
+ %tmp9 = load float, float addrspace(3)* %tmp8, align 4
+ %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6
+ %tmp11 = load float, float addrspace(3)* %tmp10, align 4
+ %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 7
+ %tmp13 = load float, float addrspace(3)* %tmp12, align 4
+ %tmp14 = tail call float @llvm.fmuladd.f32(float %tmp9, float %tmp11, float %tmp13)
+ %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 9
+ %tmp16 = load float, float addrspace(3)* %tmp15, align 4
+ %tmp17 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10
+ %tmp18 = load float, float addrspace(3)* %tmp17, align 4
+ %tmp19 = getelementptr inbounds float, float addrspace(3)* %arg, i32 11
+ %tmp20 = load float, float addrspace(3)* %tmp19, align 4
+ %tmp21 = tail call float @llvm.fmuladd.f32(float %tmp16, float %tmp18, float %tmp20)
+ %tmp22 = getelementptr inbounds float, float addrspace(3)* %arg, i32 13
+ %tmp23 = load float, float addrspace(3)* %tmp22, align 4
+ %tmp24 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14
+ %tmp25 = load float, float addrspace(3)* %tmp24, align 4
+ %tmp26 = getelementptr inbounds float, float addrspace(3)* %arg, i32 15
+ %tmp27 = load float, float addrspace(3)* %tmp26, align 4
+ %tmp28 = tail call float @llvm.fmuladd.f32(float %tmp23, float %tmp25, float %tmp27)
+ %tmp29 = getelementptr inbounds float, float addrspace(3)* %arg, i32 17
+ %tmp30 = load float, float addrspace(3)* %tmp29, align 4
+ %tmp31 = getelementptr inbounds float, float addrspace(3)* %arg, i32 18
+ %tmp32 = load float, float addrspace(3)* %tmp31, align 4
+ %tmp33 = getelementptr inbounds float, float addrspace(3)* %arg, i32 19
+ %tmp34 = load float, float addrspace(3)* %tmp33, align 4
+ %tmp35 = tail call float @llvm.fmuladd.f32(float %tmp30, float %tmp32, float %tmp34)
+ %tmp36 = getelementptr inbounds float, float addrspace(3)* %arg, i32 21
+ %tmp37 = load float, float addrspace(3)* %tmp36, align 4
+ %tmp38 = getelementptr inbounds float, float addrspace(3)* %arg, i32 22
+ %tmp39 = load float, float addrspace(3)* %tmp38, align 4
+ %tmp40 = getelementptr inbounds float, float addrspace(3)* %arg, i32 23
+ %tmp41 = load float, float addrspace(3)* %tmp40, align 4
+ %tmp42 = tail call float @llvm.fmuladd.f32(float %tmp37, float %tmp39, float %tmp41)
+ %tmp43 = getelementptr inbounds float, float addrspace(3)* %arg, i32 25
+ %tmp44 = load float, float addrspace(3)* %tmp43, align 4
+ %tmp45 = getelementptr inbounds float, float addrspace(3)* %arg, i32 26
+ %tmp46 = load float, float addrspace(3)* %tmp45, align 4
+ %tmp47 = getelementptr inbounds float, float addrspace(3)* %arg, i32 27
+ %tmp48 = load float, float addrspace(3)* %tmp47, align 4
+ %tmp49 = tail call float @llvm.fmuladd.f32(float %tmp44, float %tmp46, float %tmp48)
+ %tmp50 = getelementptr inbounds float, float addrspace(3)* %arg, i32 29
+ %tmp51 = load float, float addrspace(3)* %tmp50, align 4
+ %tmp52 = getelementptr inbounds float, float addrspace(3)* %arg, i32 30
+ %tmp53 = load float, float addrspace(3)* %tmp52, align 4
+ %tmp54 = getelementptr inbounds float, float addrspace(3)* %arg, i32 31
+ %tmp55 = load float, float addrspace(3)* %tmp54, align 4
+ %tmp56 = tail call float @llvm.fmuladd.f32(float %tmp51, float %tmp53, float %tmp55)
+ %tmp57 = getelementptr inbounds float, float addrspace(3)* %arg, i32 33
+ %tmp58 = load float, float addrspace(3)* %tmp57, align 4
+ %tmp59 = getelementptr inbounds float, float addrspace(3)* %arg, i32 34
+ %tmp60 = load float, float addrspace(3)* %tmp59, align 4
+ %tmp61 = getelementptr inbounds float, float addrspace(3)* %arg, i32 35
+ %tmp62 = load float, float addrspace(3)* %tmp61, align 4
+ %tmp63 = tail call float @llvm.fmuladd.f32(float %tmp58, float %tmp60, float %tmp62)
+ %tmp64 = getelementptr inbounds float, float addrspace(3)* %arg, i32 37
+ %tmp65 = load float, float addrspace(3)* %tmp64, align 4
+ %tmp66 = getelementptr inbounds float, float addrspace(3)* %arg, i32 38
+ %tmp67 = load float, float addrspace(3)* %tmp66, align 4
+ %tmp68 = getelementptr inbounds float, float addrspace(3)* %arg, i32 39
+ %tmp69 = load float, float addrspace(3)* %tmp68, align 4
+ %tmp70 = tail call float @llvm.fmuladd.f32(float %tmp65, float %tmp67, float %tmp69)
+ %tmp71 = getelementptr inbounds float, float addrspace(3)* %arg, i32 41
+ %tmp72 = load float, float addrspace(3)* %tmp71, align 4
+ %tmp73 = getelementptr inbounds float, float addrspace(3)* %arg, i32 42
+ %tmp74 = load float, float addrspace(3)* %tmp73, align 4
+ %tmp75 = getelementptr inbounds float, float addrspace(3)* %arg, i32 43
+ %tmp76 = load float, float addrspace(3)* %tmp75, align 4
+ %tmp77 = tail call float @llvm.fmuladd.f32(float %tmp72, float %tmp74, float %tmp76)
+ %tmp78 = getelementptr inbounds float, float addrspace(3)* %arg, i32 45
+ %tmp79 = load float, float addrspace(3)* %tmp78, align 4
+ %tmp80 = getelementptr inbounds float, float addrspace(3)* %arg, i32 46
+ %tmp81 = load float, float addrspace(3)* %tmp80, align 4
+ %tmp82 = getelementptr inbounds float, float addrspace(3)* %arg, i32 47
+ %tmp83 = load float, float addrspace(3)* %tmp82, align 4
+ %tmp84 = tail call float @llvm.fmuladd.f32(float %tmp79, float %tmp81, float %tmp83)
+ %tmp85 = getelementptr inbounds float, float addrspace(3)* %arg, i32 49
+ %tmp86 = load float, float addrspace(3)* %tmp85, align 4
+ %tmp87 = getelementptr inbounds float, float addrspace(3)* %arg, i32 50
+ %tmp88 = load float, float addrspace(3)* %tmp87, align 4
+ %tmp89 = getelementptr inbounds float, float addrspace(3)* %arg, i32 51
+ %tmp90 = load float, float addrspace(3)* %tmp89, align 4
+ %tmp91 = tail call float @llvm.fmuladd.f32(float %tmp86, float %tmp88, float %tmp90)
+ %tmp92 = getelementptr inbounds float, float addrspace(3)* %arg, i32 53
+ %tmp93 = load float, float addrspace(3)* %tmp92, align 4
+ %tmp94 = getelementptr inbounds float, float addrspace(3)* %arg, i32 54
+ %tmp95 = load float, float addrspace(3)* %tmp94, align 4
+ %tmp96 = getelementptr inbounds float, float addrspace(3)* %arg, i32 55
+ %tmp97 = load float, float addrspace(3)* %tmp96, align 4
+ %tmp98 = tail call float @llvm.fmuladd.f32(float %tmp93, float %tmp95, float %tmp97)
+ %tmp99 = getelementptr inbounds float, float addrspace(3)* %arg, i32 57
+ %tmp100 = load float, float addrspace(3)* %tmp99, align 4
+ %tmp101 = getelementptr inbounds float, float addrspace(3)* %arg, i32 58
+ %tmp102 = load float, float addrspace(3)* %tmp101, align 4
+ %tmp103 = getelementptr inbounds float, float addrspace(3)* %arg, i32 59
+ %tmp104 = load float, float addrspace(3)* %tmp103, align 4
+ %tmp105 = tail call float @llvm.fmuladd.f32(float %tmp100, float %tmp102, float %tmp104)
+ %tmp106 = getelementptr inbounds float, float addrspace(3)* %arg, i32 61
+ %tmp107 = load float, float addrspace(3)* %tmp106, align 4
+ %tmp108 = getelementptr inbounds float, float addrspace(3)* %arg, i32 62
+ %tmp109 = load float, float addrspace(3)* %tmp108, align 4
+ %tmp110 = getelementptr inbounds float, float addrspace(3)* %arg, i32 63
+ %tmp111 = load float, float addrspace(3)* %tmp110, align 4
+ %tmp112 = tail call float @llvm.fmuladd.f32(float %tmp107, float %tmp109, float %tmp111)
+ %tmp113 = getelementptr inbounds float, float addrspace(3)* %arg, i32 65
+ %tmp114 = load float, float addrspace(3)* %tmp113, align 4
+ %tmp115 = getelementptr inbounds float, float addrspace(3)* %arg, i32 66
+ %tmp116 = load float, float addrspace(3)* %tmp115, align 4
+ %tmp117 = getelementptr inbounds float, float addrspace(3)* %arg, i32 67
+ %tmp118 = load float, float addrspace(3)* %tmp117, align 4
+ %tmp119 = tail call float @llvm.fmuladd.f32(float %tmp114, float %tmp116, float %tmp118)
+ %tmp120 = getelementptr inbounds float, float addrspace(3)* %arg, i32 69
+ %tmp121 = load float, float addrspace(3)* %tmp120, align 4
+ %tmp122 = getelementptr inbounds float, float addrspace(3)* %arg, i32 70
+ %tmp123 = load float, float addrspace(3)* %tmp122, align 4
+ %tmp124 = getelementptr inbounds float, float addrspace(3)* %arg, i32 71
+ %tmp125 = load float, float addrspace(3)* %tmp124, align 4
+ %tmp126 = tail call float @llvm.fmuladd.f32(float %tmp121, float %tmp123, float %tmp125)
+ %tmp127 = getelementptr inbounds float, float addrspace(3)* %arg, i32 73
+ %tmp128 = load float, float addrspace(3)* %tmp127, align 4
+ %tmp129 = getelementptr inbounds float, float addrspace(3)* %arg, i32 74
+ %tmp130 = load float, float addrspace(3)* %tmp129, align 4
+ %tmp131 = getelementptr inbounds float, float addrspace(3)* %arg, i32 75
+ %tmp132 = load float, float addrspace(3)* %tmp131, align 4
+ %tmp133 = tail call float @llvm.fmuladd.f32(float %tmp128, float %tmp130, float %tmp132)
+ %tmp134 = getelementptr inbounds float, float addrspace(3)* %arg, i32 77
+ %tmp135 = load float, float addrspace(3)* %tmp134, align 4
+ %tmp136 = getelementptr inbounds float, float addrspace(3)* %arg, i32 78
+ %tmp137 = load float, float addrspace(3)* %tmp136, align 4
+ %tmp138 = getelementptr inbounds float, float addrspace(3)* %arg, i32 79
+ %tmp139 = load float, float addrspace(3)* %tmp138, align 4
+ %tmp140 = tail call float @llvm.fmuladd.f32(float %tmp135, float %tmp137, float %tmp139)
+ %tmp141 = getelementptr inbounds float, float addrspace(3)* %arg, i32 81
+ %tmp142 = load float, float addrspace(3)* %tmp141, align 4
+ %tmp143 = getelementptr inbounds float, float addrspace(3)* %arg, i32 82
+ %tmp144 = load float, float addrspace(3)* %tmp143, align 4
+ %tmp145 = getelementptr inbounds float, float addrspace(3)* %arg, i32 83
+ %tmp146 = load float, float addrspace(3)* %tmp145, align 4
+ %tmp147 = tail call float @llvm.fmuladd.f32(float %tmp142, float %tmp144, float %tmp146)
+ %tmp148 = getelementptr inbounds float, float addrspace(3)* %arg, i32 85
+ %tmp149 = load float, float addrspace(3)* %tmp148, align 4
+ %tmp150 = getelementptr inbounds float, float addrspace(3)* %arg, i32 86
+ %tmp151 = load float, float addrspace(3)* %tmp150, align 4
+ %tmp152 = getelementptr inbounds float, float addrspace(3)* %arg, i32 87
+ %tmp153 = load float, float addrspace(3)* %tmp152, align 4
+ %tmp154 = tail call float @llvm.fmuladd.f32(float %tmp149, float %tmp151, float %tmp153)
+ %tmp155 = getelementptr inbounds float, float addrspace(3)* %arg, i32 89
+ %tmp156 = load float, float addrspace(3)* %tmp155, align 4
+ %tmp157 = getelementptr inbounds float, float addrspace(3)* %arg, i32 90
+ %tmp158 = load float, float addrspace(3)* %tmp157, align 4
+ %tmp159 = getelementptr inbounds float, float addrspace(3)* %arg, i32 91
+ %tmp160 = load float, float addrspace(3)* %tmp159, align 4
+ %tmp161 = tail call float @llvm.fmuladd.f32(float %tmp156, float %tmp158, float %tmp160)
+ %tmp162 = getelementptr inbounds float, float addrspace(3)* %arg, i32 93
+ %tmp163 = load float, float addrspace(3)* %tmp162, align 4
+ %tmp164 = getelementptr inbounds float, float addrspace(3)* %arg, i32 94
+ %tmp165 = load float, float addrspace(3)* %tmp164, align 4
+ %tmp166 = getelementptr inbounds float, float addrspace(3)* %arg, i32 95
+ %tmp167 = load float, float addrspace(3)* %tmp166, align 4
+ %tmp168 = tail call float @llvm.fmuladd.f32(float %tmp163, float %tmp165, float %tmp167)
+ %tmp169 = getelementptr inbounds float, float addrspace(3)* %arg, i32 97
+ %tmp170 = load float, float addrspace(3)* %tmp169, align 4
+ %tmp171 = getelementptr inbounds float, float addrspace(3)* %arg, i32 98
+ %tmp172 = load float, float addrspace(3)* %tmp171, align 4
+ %tmp173 = getelementptr inbounds float, float addrspace(3)* %arg, i32 99
+ %tmp174 = load float, float addrspace(3)* %tmp173, align 4
+ %tmp175 = tail call float @llvm.fmuladd.f32(float %tmp170, float %tmp172, float %tmp174)
+ %tmp176 = getelementptr inbounds float, float addrspace(3)* %arg, i32 101
+ %tmp177 = load float, float addrspace(3)* %tmp176, align 4
+ %tmp178 = getelementptr inbounds float, float addrspace(3)* %arg, i32 102
+ %tmp179 = load float, float addrspace(3)* %tmp178, align 4
+ %tmp180 = getelementptr inbounds float, float addrspace(3)* %arg, i32 103
+ %tmp181 = load float, float addrspace(3)* %tmp180, align 4
+ %tmp182 = tail call float @llvm.fmuladd.f32(float %tmp177, float %tmp179, float %tmp181)
+ %tmp183 = getelementptr inbounds float, float addrspace(3)* %arg, i32 105
+ %tmp184 = load float, float addrspace(3)* %tmp183, align 4
+ %tmp185 = getelementptr inbounds float, float addrspace(3)* %arg, i32 106
+ %tmp186 = load float, float addrspace(3)* %tmp185, align 4
+ %tmp187 = getelementptr inbounds float, float addrspace(3)* %arg, i32 107
+ %tmp188 = load float, float addrspace(3)* %tmp187, align 4
+ %tmp189 = tail call float @llvm.fmuladd.f32(float %tmp184, float %tmp186, float %tmp188)
+ %tmp190 = getelementptr inbounds float, float addrspace(3)* %arg, i32 109
+ %tmp191 = load float, float addrspace(3)* %tmp190, align 4
+ %tmp192 = getelementptr inbounds float, float addrspace(3)* %arg, i32 110
+ %tmp193 = load float, float addrspace(3)* %tmp192, align 4
+ %tmp194 = getelementptr inbounds float, float addrspace(3)* %arg, i32 111
+ %tmp195 = load float, float addrspace(3)* %tmp194, align 4
+ %tmp196 = tail call float @llvm.fmuladd.f32(float %tmp191, float %tmp193, float %tmp195)
+ %tmp197 = getelementptr inbounds float, float addrspace(3)* %arg, i32 113
+ %tmp198 = load float, float addrspace(3)* %tmp197, align 4
+ %tmp199 = getelementptr inbounds float, float addrspace(3)* %arg, i32 114
+ %tmp200 = load float, float addrspace(3)* %tmp199, align 4
+ %tmp201 = getelementptr inbounds float, float addrspace(3)* %arg, i32 115
+ %tmp202 = load float, float addrspace(3)* %tmp201, align 4
+ %tmp203 = tail call float @llvm.fmuladd.f32(float %tmp198, float %tmp200, float %tmp202)
+ %tmp204 = getelementptr inbounds float, float addrspace(3)* %arg, i32 117
+ %tmp205 = load float, float addrspace(3)* %tmp204, align 4
+ %tmp206 = getelementptr inbounds float, float addrspace(3)* %arg, i32 118
+ %tmp207 = load float, float addrspace(3)* %tmp206, align 4
+ %tmp208 = getelementptr inbounds float, float addrspace(3)* %arg, i32 119
+ %tmp209 = load float, float addrspace(3)* %tmp208, align 4
+ %tmp210 = tail call float @llvm.fmuladd.f32(float %tmp205, float %tmp207, float %tmp209)
+ %tmp211 = getelementptr inbounds float, float addrspace(3)* %arg, i32 121
+ %tmp212 = load float, float addrspace(3)* %tmp211, align 4
+ %tmp213 = getelementptr inbounds float, float addrspace(3)* %arg, i32 122
+ %tmp214 = load float, float addrspace(3)* %tmp213, align 4
+ %tmp215 = getelementptr inbounds float, float addrspace(3)* %arg, i32 123
+ %tmp216 = load float, float addrspace(3)* %tmp215, align 4
+ %tmp217 = tail call float @llvm.fmuladd.f32(float %tmp212, float %tmp214, float %tmp216)
+ %tmp218 = getelementptr inbounds float, float addrspace(3)* %arg, i32 125
+ %tmp219 = load float, float addrspace(3)* %tmp218, align 4
+ %tmp220 = getelementptr inbounds float, float addrspace(3)* %arg, i32 126
+ %tmp221 = load float, float addrspace(3)* %tmp220, align 4
+ %tmp222 = getelementptr inbounds float, float addrspace(3)* %arg, i32 127
+ %tmp223 = load float, float addrspace(3)* %tmp222, align 4
+ %tmp224 = tail call float @llvm.fmuladd.f32(float %tmp219, float %tmp221, float %tmp223)
+ %tmp225 = getelementptr inbounds float, float addrspace(3)* %arg, i32 129
+ %tmp226 = load float, float addrspace(3)* %tmp225, align 4
+ %tmp227 = getelementptr inbounds float, float addrspace(3)* %arg, i32 130
+ %tmp228 = load float, float addrspace(3)* %tmp227, align 4
+ %tmp229 = getelementptr inbounds float, float addrspace(3)* %arg, i32 131
+ %tmp230 = load float, float addrspace(3)* %tmp229, align 4
+ %tmp231 = tail call float @llvm.fmuladd.f32(float %tmp226, float %tmp228, float %tmp230)
+ %tmp232 = getelementptr inbounds float, float addrspace(3)* %arg, i32 133
+ %tmp233 = load float, float addrspace(3)* %tmp232, align 4
+ %tmp234 = getelementptr inbounds float, float addrspace(3)* %arg, i32 134
+ %tmp235 = load float, float addrspace(3)* %tmp234, align 4
+ %tmp236 = getelementptr inbounds float, float addrspace(3)* %arg, i32 135
+ %tmp237 = load float, float addrspace(3)* %tmp236, align 4
+ %tmp238 = tail call float @llvm.fmuladd.f32(float %tmp233, float %tmp235, float %tmp237)
+ %tmp239 = getelementptr inbounds float, float addrspace(3)* %arg, i32 137
+ %tmp240 = load float, float addrspace(3)* %tmp239, align 4
+ %tmp241 = getelementptr inbounds float, float addrspace(3)* %arg, i32 138
+ %tmp242 = load float, float addrspace(3)* %tmp241, align 4
+ %tmp243 = getelementptr inbounds float, float addrspace(3)* %arg, i32 139
+ %tmp244 = load float, float addrspace(3)* %tmp243, align 4
+ %tmp245 = tail call float @llvm.fmuladd.f32(float %tmp240, float %tmp242, float %tmp244)
+ %tmp246 = getelementptr inbounds float, float addrspace(3)* %arg, i32 141
+ %tmp247 = load float, float addrspace(3)* %tmp246, align 4
+ %tmp248 = getelementptr inbounds float, float addrspace(3)* %arg, i32 142
+ %tmp249 = load float, float addrspace(3)* %tmp248, align 4
+ %tmp250 = getelementptr inbounds float, float addrspace(3)* %arg, i32 143
+ %tmp251 = load float, float addrspace(3)* %tmp250, align 4
+ %tmp252 = tail call float @llvm.fmuladd.f32(float %tmp247, float %tmp249, float %tmp251)
+ %tmp253 = getelementptr inbounds float, float addrspace(3)* %arg, i32 145
+ %tmp254 = load float, float addrspace(3)* %tmp253, align 4
+ %tmp255 = getelementptr inbounds float, float addrspace(3)* %arg, i32 146
+ %tmp256 = load float, float addrspace(3)* %tmp255, align 4
+ %tmp257 = getelementptr inbounds float, float addrspace(3)* %arg, i32 147
+ %tmp258 = load float, float addrspace(3)* %tmp257, align 4
+ %tmp259 = tail call float @llvm.fmuladd.f32(float %tmp254, float %tmp256, float %tmp258)
+ %tmp260 = getelementptr inbounds float, float addrspace(3)* %arg, i32 149
+ %tmp261 = load float, float addrspace(3)* %tmp260, align 4
+ %tmp262 = getelementptr inbounds float, float addrspace(3)* %arg, i32 150
+ %tmp263 = load float, float addrspace(3)* %tmp262, align 4
+ %tmp264 = getelementptr inbounds float, float addrspace(3)* %arg, i32 151
+ %tmp265 = load float, float addrspace(3)* %tmp264, align 4
+ %tmp266 = tail call float @llvm.fmuladd.f32(float %tmp261, float %tmp263, float %tmp265)
+ %tmp267 = getelementptr inbounds float, float addrspace(3)* %arg, i32 153
+ %tmp268 = load float, float addrspace(3)* %tmp267, align 4
+ %tmp269 = getelementptr inbounds float, float addrspace(3)* %arg, i32 154
+ %tmp270 = load float, float addrspace(3)* %tmp269, align 4
+ %tmp271 = getelementptr inbounds float, float addrspace(3)* %arg, i32 155
+ %tmp272 = load float, float addrspace(3)* %tmp271, align 4
+ %tmp273 = tail call float @llvm.fmuladd.f32(float %tmp268, float %tmp270, float %tmp272)
+ %tmp274 = getelementptr inbounds float, float addrspace(3)* %arg, i32 157
+ %tmp275 = load float, float addrspace(3)* %tmp274, align 4
+ %tmp276 = getelementptr inbounds float, float addrspace(3)* %arg, i32 158
+ %tmp277 = load float, float addrspace(3)* %tmp276, align 4
+ %tmp278 = getelementptr inbounds float, float addrspace(3)* %arg, i32 159
+ %tmp279 = load float, float addrspace(3)* %tmp278, align 4
+ %tmp280 = tail call float @llvm.fmuladd.f32(float %tmp275, float %tmp277, float %tmp279)
+ %tmp281 = getelementptr inbounds float, float addrspace(3)* %arg, i32 161
+ %tmp282 = load float, float addrspace(3)* %tmp281, align 4
+ %tmp283 = getelementptr inbounds float, float addrspace(3)* %arg, i32 162
+ %tmp284 = load float, float addrspace(3)* %tmp283, align 4
+ %tmp285 = getelementptr inbounds float, float addrspace(3)* %arg, i32 163
+ %tmp286 = load float, float addrspace(3)* %tmp285, align 4
+ %tmp287 = tail call float @llvm.fmuladd.f32(float %tmp282, float %tmp284, float %tmp286)
+ %tmp288 = getelementptr inbounds float, float addrspace(3)* %arg, i32 165
+ %tmp289 = load float, float addrspace(3)* %tmp288, align 4
+ %tmp290 = getelementptr inbounds float, float addrspace(3)* %arg, i32 166
+ %tmp291 = load float, float addrspace(3)* %tmp290, align 4
+ %tmp292 = getelementptr inbounds float, float addrspace(3)* %arg, i32 167
+ %tmp293 = load float, float addrspace(3)* %tmp292, align 4
+ %tmp294 = tail call float @llvm.fmuladd.f32(float %tmp289, float %tmp291, float %tmp293)
+ %tmp295 = getelementptr inbounds float, float addrspace(3)* %arg, i32 169
+ %tmp296 = load float, float addrspace(3)* %tmp295, align 4
+ %tmp297 = getelementptr inbounds float, float addrspace(3)* %arg, i32 170
+ %tmp298 = load float, float addrspace(3)* %tmp297, align 4
+ %tmp299 = getelementptr inbounds float, float addrspace(3)* %arg, i32 171
+ %tmp300 = load float, float addrspace(3)* %tmp299, align 4
+ %tmp301 = tail call float @llvm.fmuladd.f32(float %tmp296, float %tmp298, float %tmp300)
+ %tmp302 = getelementptr inbounds float, float addrspace(3)* %arg, i32 173
+ %tmp303 = load float, float addrspace(3)* %tmp302, align 4
+ %tmp304 = getelementptr inbounds float, float addrspace(3)* %arg, i32 174
+ %tmp305 = load float, float addrspace(3)* %tmp304, align 4
+ %tmp306 = getelementptr inbounds float, float addrspace(3)* %arg, i32 175
+ %tmp307 = load float, float addrspace(3)* %tmp306, align 4
+ %tmp308 = tail call float @llvm.fmuladd.f32(float %tmp303, float %tmp305, float %tmp307)
+ %tmp309 = getelementptr inbounds float, float addrspace(3)* %arg, i32 177
+ %tmp310 = load float, float addrspace(3)* %tmp309, align 4
+ %tmp311 = getelementptr inbounds float, float addrspace(3)* %arg, i32 178
+ %tmp312 = load float, float addrspace(3)* %tmp311, align 4
+ %tmp313 = getelementptr inbounds float, float addrspace(3)* %arg, i32 179
+ %tmp314 = load float, float addrspace(3)* %tmp313, align 4
+ %tmp315 = tail call float @llvm.fmuladd.f32(float %tmp310, float %tmp312, float %tmp314)
+ %tmp316 = getelementptr inbounds float, float addrspace(3)* %arg, i32 181
+ %tmp317 = load float, float addrspace(3)* %tmp316, align 4
+ %tmp318 = getelementptr inbounds float, float addrspace(3)* %arg, i32 182
+ %tmp319 = load float, float addrspace(3)* %tmp318, align 4
+ %tmp320 = getelementptr inbounds float, float addrspace(3)* %arg, i32 183
+ %tmp321 = load float, float addrspace(3)* %tmp320, align 4
+ %tmp322 = tail call float @llvm.fmuladd.f32(float %tmp317, float %tmp319, float %tmp321)
+ %tmp323 = getelementptr inbounds float, float addrspace(3)* %arg, i32 185
+ %tmp324 = load float, float addrspace(3)* %tmp323, align 4
+ %tmp325 = getelementptr inbounds float, float addrspace(3)* %arg, i32 186
+ %tmp326 = load float, float addrspace(3)* %tmp325, align 4
+ %tmp327 = getelementptr inbounds float, float addrspace(3)* %arg, i32 187
+ %tmp328 = load float, float addrspace(3)* %tmp327, align 4
+ %tmp329 = tail call float @llvm.fmuladd.f32(float %tmp324, float %tmp326, float %tmp328)
+ %tmp330 = getelementptr inbounds float, float addrspace(3)* %arg, i32 189
+ %tmp331 = load float, float addrspace(3)* %tmp330, align 4
+ %tmp332 = getelementptr inbounds float, float addrspace(3)* %arg, i32 190
+ %tmp333 = load float, float addrspace(3)* %tmp332, align 4
+ %tmp334 = getelementptr inbounds float, float addrspace(3)* %arg, i32 191
+ %tmp335 = load float, float addrspace(3)* %tmp334, align 4
+ %tmp336 = tail call float @llvm.fmuladd.f32(float %tmp331, float %tmp333, float %tmp335)
+ %tmp337 = getelementptr inbounds float, float addrspace(3)* %arg, i32 193
+ %tmp338 = load float, float addrspace(3)* %tmp337, align 4
+ %tmp339 = getelementptr inbounds float, float addrspace(3)* %arg, i32 194
+ %tmp340 = load float, float addrspace(3)* %tmp339, align 4
+ %tmp341 = getelementptr inbounds float, float addrspace(3)* %arg, i32 195
+ %tmp342 = load float, float addrspace(3)* %tmp341, align 4
+ %tmp343 = tail call float @llvm.fmuladd.f32(float %tmp338, float %tmp340, float %tmp342)
+ %tmp344 = getelementptr inbounds float, float addrspace(3)* %arg, i32 197
+ %tmp345 = load float, float addrspace(3)* %tmp344, align 4
+ %tmp346 = getelementptr inbounds float, float addrspace(3)* %arg, i32 198
+ %tmp347 = load float, float addrspace(3)* %tmp346, align 4
+ %tmp348 = getelementptr inbounds float, float addrspace(3)* %arg, i32 199
+ %tmp349 = load float, float addrspace(3)* %tmp348, align 4
+ %tmp350 = tail call float @llvm.fmuladd.f32(float %tmp345, float %tmp347, float %tmp349)
+ %tmp351 = getelementptr inbounds float, float addrspace(3)* %arg, i32 201
+ %tmp352 = load float, float addrspace(3)* %tmp351, align 4
+ %tmp353 = getelementptr inbounds float, float addrspace(3)* %arg, i32 202
+ %tmp354 = load float, float addrspace(3)* %tmp353, align 4
+ %tmp355 = getelementptr inbounds float, float addrspace(3)* %arg, i32 203
+ %tmp356 = load float, float addrspace(3)* %tmp355, align 4
+ %tmp357 = tail call float @llvm.fmuladd.f32(float %tmp352, float %tmp354, float %tmp356)
+ %tmp358 = getelementptr inbounds float, float addrspace(3)* %arg, i32 205
+ %tmp359 = load float, float addrspace(3)* %tmp358, align 4
+ %tmp360 = getelementptr inbounds float, float addrspace(3)* %arg, i32 206
+ %tmp361 = load float, float addrspace(3)* %tmp360, align 4
+ %tmp362 = getelementptr inbounds float, float addrspace(3)* %arg, i32 207
+ %tmp363 = load float, float addrspace(3)* %tmp362, align 4
+ %tmp364 = tail call float @llvm.fmuladd.f32(float %tmp359, float %tmp361, float %tmp363)
+ %tmp365 = getelementptr inbounds float, float addrspace(3)* %arg, i32 209
+ %tmp366 = load float, float addrspace(3)* %tmp365, align 4
+ %tmp367 = getelementptr inbounds float, float addrspace(3)* %arg, i32 210
+ %tmp368 = load float, float addrspace(3)* %tmp367, align 4
+ %tmp369 = getelementptr inbounds float, float addrspace(3)* %arg, i32 211
+ %tmp370 = load float, float addrspace(3)* %tmp369, align 4
+ %tmp371 = tail call float @llvm.fmuladd.f32(float %tmp366, float %tmp368, float %tmp370)
+ %tmp372 = getelementptr inbounds float, float addrspace(3)* %arg, i32 213
+ %tmp373 = load float, float addrspace(3)* %tmp372, align 4
+ %tmp374 = getelementptr inbounds float, float addrspace(3)* %arg, i32 214
+ %tmp375 = load float, float addrspace(3)* %tmp374, align 4
+ %tmp376 = getelementptr inbounds float, float addrspace(3)* %arg, i32 215
+ %tmp377 = load float, float addrspace(3)* %tmp376, align 4
+ %tmp378 = tail call float @llvm.fmuladd.f32(float %tmp373, float %tmp375, float %tmp377)
+ %tmp379 = getelementptr inbounds float, float addrspace(3)* %arg, i32 217
+ %tmp380 = load float, float addrspace(3)* %tmp379, align 4
+ %tmp381 = getelementptr inbounds float, float addrspace(3)* %arg, i32 218
+ %tmp382 = load float, float addrspace(3)* %tmp381, align 4
+ %tmp383 = getelementptr inbounds float, float addrspace(3)* %arg, i32 219
+ %tmp384 = load float, float addrspace(3)* %tmp383, align 4
+ %tmp385 = tail call float @llvm.fmuladd.f32(float %tmp380, float %tmp382, float %tmp384)
+ %tmp386 = getelementptr inbounds float, float addrspace(3)* %arg, i32 221
+ %tmp387 = load float, float addrspace(3)* %tmp386, align 4
+ %tmp388 = getelementptr inbounds float, float addrspace(3)* %arg, i32 222
+ %tmp389 = load float, float addrspace(3)* %tmp388, align 4
+ %tmp390 = getelementptr inbounds float, float addrspace(3)* %arg, i32 223
+ %tmp391 = load float, float addrspace(3)* %tmp390, align 4
+ %tmp392 = tail call float @llvm.fmuladd.f32(float %tmp387, float %tmp389, float %tmp391)
+ %tmp393 = getelementptr inbounds float, float addrspace(3)* %arg, i32 225
+ %tmp394 = load float, float addrspace(3)* %tmp393, align 4
+ %tmp395 = getelementptr inbounds float, float addrspace(3)* %arg, i32 226
+ %tmp396 = load float, float addrspace(3)* %tmp395, align 4
+ %tmp397 = getelementptr inbounds float, float addrspace(3)* %arg, i32 227
+ %tmp398 = load float, float addrspace(3)* %tmp397, align 4
+ %tmp399 = tail call float @llvm.fmuladd.f32(float %tmp394, float %tmp396, float %tmp398)
+ %tmp400 = getelementptr inbounds float, float addrspace(3)* %arg, i32 229
+ %tmp401 = load float, float addrspace(3)* %tmp400, align 4
+ %tmp402 = getelementptr inbounds float, float addrspace(3)* %arg, i32 230
+ %tmp403 = load float, float addrspace(3)* %tmp402, align 4
+ %tmp404 = getelementptr inbounds float, float addrspace(3)* %arg, i32 231
+ %tmp405 = load float, float addrspace(3)* %tmp404, align 4
+ %tmp406 = tail call float @llvm.fmuladd.f32(float %tmp401, float %tmp403, float %tmp405)
+ %tmp407 = getelementptr inbounds float, float addrspace(3)* %arg, i32 233
+ %tmp408 = load float, float addrspace(3)* %tmp407, align 4
+ %tmp409 = getelementptr inbounds float, float addrspace(3)* %arg, i32 234
+ %tmp410 = load float, float addrspace(3)* %tmp409, align 4
+ %tmp411 = getelementptr inbounds float, float addrspace(3)* %arg, i32 235
+ %tmp412 = load float, float addrspace(3)* %tmp411, align 4
+ %tmp413 = tail call float @llvm.fmuladd.f32(float %tmp408, float %tmp410, float %tmp412)
+ %tmp414 = getelementptr inbounds float, float addrspace(3)* %arg, i32 237
+ %tmp415 = load float, float addrspace(3)* %tmp414, align 4
+ %tmp416 = getelementptr inbounds float, float addrspace(3)* %arg, i32 238
+ %tmp417 = load float, float addrspace(3)* %tmp416, align 4
+ %tmp418 = getelementptr inbounds float, float addrspace(3)* %arg, i32 239
+ %tmp419 = load float, float addrspace(3)* %tmp418, align 4
+ %tmp420 = tail call float @llvm.fmuladd.f32(float %tmp415, float %tmp417, float %tmp419)
+ %tmp421 = getelementptr inbounds float, float addrspace(3)* %arg, i32 241
+ %tmp422 = load float, float addrspace(3)* %tmp421, align 4
+ %tmp423 = getelementptr inbounds float, float addrspace(3)* %arg, i32 242
+ %tmp424 = load float, float addrspace(3)* %tmp423, align 4
+ %tmp425 = getelementptr inbounds float, float addrspace(3)* %arg, i32 243
+ %tmp426 = load float, float addrspace(3)* %tmp425, align 4
+ %tmp427 = tail call float @llvm.fmuladd.f32(float %tmp422, float %tmp424, float %tmp426)
+ %tmp428 = getelementptr inbounds float, float addrspace(3)* %arg, i32 245
+ %tmp429 = load float, float addrspace(3)* %tmp428, align 4
+ %tmp430 = getelementptr inbounds float, float addrspace(3)* %arg, i32 246
+ %tmp431 = load float, float addrspace(3)* %tmp430, align 4
+ %tmp432 = getelementptr inbounds float, float addrspace(3)* %arg, i32 247
+ %tmp433 = load float, float addrspace(3)* %tmp432, align 4
+ %tmp434 = tail call float @llvm.fmuladd.f32(float %tmp429, float %tmp431, float %tmp433)
+ %tmp435 = getelementptr inbounds float, float addrspace(3)* %arg, i32 249
+ %tmp436 = load float, float addrspace(3)* %tmp435, align 4
+ %tmp437 = getelementptr inbounds float, float addrspace(3)* %arg, i32 250
+ %tmp438 = load float, float addrspace(3)* %tmp437, align 4
+ %tmp439 = getelementptr inbounds float, float addrspace(3)* %arg, i32 251
+ %tmp440 = load float, float addrspace(3)* %tmp439, align 4
+ %tmp441 = tail call float @llvm.fmuladd.f32(float %tmp436, float %tmp438, float %tmp440)
+ %tmp442 = getelementptr inbounds float, float addrspace(3)* %arg, i32 253
+ %tmp443 = load float, float addrspace(3)* %tmp442, align 4
+ %tmp444 = getelementptr inbounds float, float addrspace(3)* %arg, i32 254
+ %tmp445 = load float, float addrspace(3)* %tmp444, align 4
+ %tmp446 = getelementptr inbounds float, float addrspace(3)* %arg, i32 255
+ %tmp447 = load float, float addrspace(3)* %tmp446, align 4
+ %tmp448 = tail call float @llvm.fmuladd.f32(float %tmp443, float %tmp445, float %tmp447)
+ store float %tmp7, float addrspace(1)* %arg1, align 4
+ %tmp449 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 1
+ store float %tmp14, float addrspace(1)* %tmp449, align 4
+ %tmp450 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 2
+ store float %tmp21, float addrspace(1)* %tmp450, align 4
+ %tmp451 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 3
+ store float %tmp28, float addrspace(1)* %tmp451, align 4
+ %tmp452 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 4
+ store float %tmp35, float addrspace(1)* %tmp452, align 4
+ %tmp453 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 5
+ store float %tmp42, float addrspace(1)* %tmp453, align 4
+ %tmp454 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 6
+ store float %tmp49, float addrspace(1)* %tmp454, align 4
+ %tmp455 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 7
+ store float %tmp56, float addrspace(1)* %tmp455, align 4
+ %tmp456 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 8
+ store float %tmp63, float addrspace(1)* %tmp456, align 4
+ %tmp457 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 9
+ store float %tmp70, float addrspace(1)* %tmp457, align 4
+ %tmp458 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 10
+ store float %tmp77, float addrspace(1)* %tmp458, align 4
+ %tmp459 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 11
+ store float %tmp84, float addrspace(1)* %tmp459, align 4
+ %tmp460 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 12
+ store float %tmp91, float addrspace(1)* %tmp460, align 4
+ %tmp461 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 13
+ store float %tmp98, float addrspace(1)* %tmp461, align 4
+ %tmp462 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 14
+ store float %tmp105, float addrspace(1)* %tmp462, align 4
+ %tmp463 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 15
+ store float %tmp112, float addrspace(1)* %tmp463, align 4
+ %tmp464 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 16
+ store float %tmp119, float addrspace(1)* %tmp464, align 4
+ %tmp465 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 17
+ store float %tmp126, float addrspace(1)* %tmp465, align 4
+ %tmp466 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 18
+ store float %tmp133, float addrspace(1)* %tmp466, align 4
+ %tmp467 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 19
+ store float %tmp140, float addrspace(1)* %tmp467, align 4
+ %tmp468 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 20
+ store float %tmp147, float addrspace(1)* %tmp468, align 4
+ %tmp469 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 21
+ store float %tmp154, float addrspace(1)* %tmp469, align 4
+ %tmp470 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 22
+ store float %tmp161, float addrspace(1)* %tmp470, align 4
+ %tmp471 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 23
+ store float %tmp168, float addrspace(1)* %tmp471, align 4
+ %tmp472 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 24
+ store float %tmp175, float addrspace(1)* %tmp472, align 4
+ %tmp473 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 25
+ store float %tmp182, float addrspace(1)* %tmp473, align 4
+ %tmp474 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 26
+ store float %tmp189, float addrspace(1)* %tmp474, align 4
+ %tmp475 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 27
+ store float %tmp196, float addrspace(1)* %tmp475, align 4
+ %tmp476 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 28
+ store float %tmp203, float addrspace(1)* %tmp476, align 4
+ %tmp477 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 29
+ store float %tmp210, float addrspace(1)* %tmp477, align 4
+ %tmp478 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 30
+ store float %tmp217, float addrspace(1)* %tmp478, align 4
+ %tmp479 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 31
+ store float %tmp224, float addrspace(1)* %tmp479, align 4
+ %tmp480 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 32
+ store float %tmp231, float addrspace(1)* %tmp480, align 4
+ %tmp481 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 33
+ store float %tmp238, float addrspace(1)* %tmp481, align 4
+ %tmp482 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 34
+ store float %tmp245, float addrspace(1)* %tmp482, align 4
+ %tmp483 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 35
+ store float %tmp252, float addrspace(1)* %tmp483, align 4
+ %tmp484 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 36
+ store float %tmp259, float addrspace(1)* %tmp484, align 4
+ %tmp485 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 37
+ store float %tmp266, float addrspace(1)* %tmp485, align 4
+ %tmp486 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 38
+ store float %tmp273, float addrspace(1)* %tmp486, align 4
+ %tmp487 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 39
+ store float %tmp280, float addrspace(1)* %tmp487, align 4
+ %tmp488 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 40
+ store float %tmp287, float addrspace(1)* %tmp488, align 4
+ %tmp489 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 41
+ store float %tmp294, float addrspace(1)* %tmp489, align 4
+ %tmp490 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 42
+ store float %tmp301, float addrspace(1)* %tmp490, align 4
+ %tmp491 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 43
+ store float %tmp308, float addrspace(1)* %tmp491, align 4
+ %tmp492 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 44
+ store float %tmp315, float addrspace(1)* %tmp492, align 4
+ %tmp493 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 45
+ store float %tmp322, float addrspace(1)* %tmp493, align 4
+ %tmp494 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 46
+ store float %tmp329, float addrspace(1)* %tmp494, align 4
+ %tmp495 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 47
+ store float %tmp336, float addrspace(1)* %tmp495, align 4
+ %tmp496 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 48
+ store float %tmp343, float addrspace(1)* %tmp496, align 4
+ %tmp497 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 49
+ store float %tmp350, float addrspace(1)* %tmp497, align 4
+ %tmp498 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 50
+ store float %tmp357, float addrspace(1)* %tmp498, align 4
+ %tmp499 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 51
+ store float %tmp364, float addrspace(1)* %tmp499, align 4
+ %tmp500 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 52
+ store float %tmp371, float addrspace(1)* %tmp500, align 4
+ %tmp501 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 53
+ store float %tmp378, float addrspace(1)* %tmp501, align 4
+ %tmp502 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 54
+ store float %tmp385, float addrspace(1)* %tmp502, align 4
+ %tmp503 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 55
+ store float %tmp392, float addrspace(1)* %tmp503, align 4
+ %tmp504 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 56
+ store float %tmp399, float addrspace(1)* %tmp504, align 4
+ %tmp505 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 57
+ store float %tmp406, float addrspace(1)* %tmp505, align 4
+ %tmp506 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 58
+ store float %tmp413, float addrspace(1)* %tmp506, align 4
+ %tmp507 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 59
+ store float %tmp420, float addrspace(1)* %tmp507, align 4
+ %tmp508 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 60
+ store float %tmp427, float addrspace(1)* %tmp508, align 4
+ %tmp509 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 61
+ store float %tmp434, float addrspace(1)* %tmp509, align 4
+ %tmp510 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 62
+ store float %tmp441, float addrspace(1)* %tmp510, align 4
+ %tmp511 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 63
+ store float %tmp448, float addrspace(1)* %tmp511, align 4
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.fmuladd.f32(float, float, float) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
new file mode 100644
index 000000000000..0d19c1e6a8f3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
@@ -0,0 +1,288 @@
+; RUN: llc -march=amdgcn -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
+
+; SI: NumSgprs: {{[1-9]$}}
+; SI: NumVgprs: {{[1-9]$}}
+
+; stores may alias loads
+; VI: NumSgprs: {{[1-5][0-9]$}}
+; VI: NumVgprs: {{[1-3][0-9]$}}
+
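+; A brief gloss on the check lines above (reading the FileCheck regexes):
+; on SI, {{[1-9]$}} only matches a single digit, so the test requires at
+; most 9 SGPRs and 9 VGPRs. On VI, {{[1-5][0-9]$}} admits 10-59 SGPRs and
+; {{[1-3][0-9]$}} admits 10-39 VGPRs; the looser bounds reflect the note
+; above that stores may alias loads, which constrains the scheduler.
+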
+define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) {
+bb:
+ %adr.a.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20004
+ %adr.b.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20252
+ %adr.c.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20508
+ %adr.a.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20772
+ %adr.b.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21020
+ %adr.c.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21276
+ %adr.a.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21540
+ %adr.b.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21788
+ %adr.c.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22044
+ %adr.a.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22308
+ %adr.b.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22556
+ %adr.c.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22812
+ %adr.a.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23076
+ %adr.b.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23324
+ %adr.c.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23580
+ %adr.a.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23844
+ %adr.b.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24092
+ %adr.c.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24348
+ %adr.a.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24612
+ %adr.b.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24860
+ %adr.c.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25116
+ %adr.a.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25380
+ %adr.b.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25628
+ %adr.c.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25884
+ %adr.a.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26148
+ %adr.b.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26396
+ %adr.c.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26652
+ %adr.a.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26916
+ %adr.b.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27164
+ %adr.c.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27420
+ %adr.a.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27684
+ %adr.b.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27932
+ %adr.c.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28188
+ %adr.a.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28452
+ %adr.b.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28700
+ %adr.c.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28956
+ %adr.a.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29220
+ %adr.b.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29468
+ %adr.c.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29724
+ %adr.a.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29988
+ %adr.b.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30236
+ %adr.c.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30492
+ %adr.a.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30756
+ %adr.b.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31004
+ %adr.c.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31260
+ %adr.a.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31524
+ %adr.b.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31772
+ %adr.c.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32028
+ %adr.a.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32292
+ %adr.b.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32540
+ %adr.c.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32796
+ %adr.a.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33060
+ %adr.b.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33308
+ %adr.c.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33564
+ %adr.a.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33828
+ %adr.b.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34076
+ %adr.c.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34332
+ %adr.a.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34596
+ %adr.b.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34844
+ %adr.c.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35100
+ %adr.a.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35364
+ %adr.b.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35612
+ %adr.c.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35868
+ %adr.a.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36132
+ %adr.b.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36380
+ %adr.c.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36636
+ %adr.a.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36900
+ %adr.b.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37148
+ %adr.c.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37404
+ %adr.a.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37668
+ %adr.b.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37916
+ %adr.c.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38172
+ %adr.a.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38436
+ %adr.b.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38684
+ %adr.c.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38940
+ %adr.a.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39204
+ %adr.b.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39452
+ %adr.c.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39708
+ %adr.a.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39972
+ %adr.b.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40220
+ %adr.c.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40476
+ %adr.a.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40740
+ %adr.b.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40988
+ %adr.c.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41244
+ %adr.a.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41508
+ %adr.b.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41756
+ %adr.c.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42012
+ %adr.a.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42276
+ %adr.b.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42524
+ %adr.c.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42780
+ %a.0 = load float, float addrspace(3)* %adr.a.0, align 4
+ %b.0 = load float, float addrspace(3)* %adr.b.0, align 4
+ %c.0 = load float, float addrspace(3)* %adr.c.0, align 4
+ %a.1 = load float, float addrspace(3)* %adr.a.1, align 4
+ %b.1 = load float, float addrspace(3)* %adr.b.1, align 4
+ %c.1 = load float, float addrspace(3)* %adr.c.1, align 4
+ %a.2 = load float, float addrspace(3)* %adr.a.2, align 4
+ %b.2 = load float, float addrspace(3)* %adr.b.2, align 4
+ %c.2 = load float, float addrspace(3)* %adr.c.2, align 4
+ %a.3 = load float, float addrspace(3)* %adr.a.3, align 4
+ %b.3 = load float, float addrspace(3)* %adr.b.3, align 4
+ %c.3 = load float, float addrspace(3)* %adr.c.3, align 4
+ %a.4 = load float, float addrspace(3)* %adr.a.4, align 4
+ %b.4 = load float, float addrspace(3)* %adr.b.4, align 4
+ %c.4 = load float, float addrspace(3)* %adr.c.4, align 4
+ %a.5 = load float, float addrspace(3)* %adr.a.5, align 4
+ %b.5 = load float, float addrspace(3)* %adr.b.5, align 4
+ %c.5 = load float, float addrspace(3)* %adr.c.5, align 4
+ %a.6 = load float, float addrspace(3)* %adr.a.6, align 4
+ %b.6 = load float, float addrspace(3)* %adr.b.6, align 4
+ %c.6 = load float, float addrspace(3)* %adr.c.6, align 4
+ %a.7 = load float, float addrspace(3)* %adr.a.7, align 4
+ %b.7 = load float, float addrspace(3)* %adr.b.7, align 4
+ %c.7 = load float, float addrspace(3)* %adr.c.7, align 4
+ %a.8 = load float, float addrspace(3)* %adr.a.8, align 4
+ %b.8 = load float, float addrspace(3)* %adr.b.8, align 4
+ %c.8 = load float, float addrspace(3)* %adr.c.8, align 4
+ %a.9 = load float, float addrspace(3)* %adr.a.9, align 4
+ %b.9 = load float, float addrspace(3)* %adr.b.9, align 4
+ %c.9 = load float, float addrspace(3)* %adr.c.9, align 4
+ %a.10 = load float, float addrspace(3)* %adr.a.10, align 4
+ %b.10 = load float, float addrspace(3)* %adr.b.10, align 4
+ %c.10 = load float, float addrspace(3)* %adr.c.10, align 4
+ %a.11 = load float, float addrspace(3)* %adr.a.11, align 4
+ %b.11 = load float, float addrspace(3)* %adr.b.11, align 4
+ %c.11 = load float, float addrspace(3)* %adr.c.11, align 4
+ %a.12 = load float, float addrspace(3)* %adr.a.12, align 4
+ %b.12 = load float, float addrspace(3)* %adr.b.12, align 4
+ %c.12 = load float, float addrspace(3)* %adr.c.12, align 4
+ %a.13 = load float, float addrspace(3)* %adr.a.13, align 4
+ %b.13 = load float, float addrspace(3)* %adr.b.13, align 4
+ %c.13 = load float, float addrspace(3)* %adr.c.13, align 4
+ %a.14 = load float, float addrspace(3)* %adr.a.14, align 4
+ %b.14 = load float, float addrspace(3)* %adr.b.14, align 4
+ %c.14 = load float, float addrspace(3)* %adr.c.14, align 4
+ %a.15 = load float, float addrspace(3)* %adr.a.15, align 4
+ %b.15 = load float, float addrspace(3)* %adr.b.15, align 4
+ %c.15 = load float, float addrspace(3)* %adr.c.15, align 4
+ %a.16 = load float, float addrspace(3)* %adr.a.16, align 4
+ %b.16 = load float, float addrspace(3)* %adr.b.16, align 4
+ %c.16 = load float, float addrspace(3)* %adr.c.16, align 4
+ %a.17 = load float, float addrspace(3)* %adr.a.17, align 4
+ %b.17 = load float, float addrspace(3)* %adr.b.17, align 4
+ %c.17 = load float, float addrspace(3)* %adr.c.17, align 4
+ %a.18 = load float, float addrspace(3)* %adr.a.18, align 4
+ %b.18 = load float, float addrspace(3)* %adr.b.18, align 4
+ %c.18 = load float, float addrspace(3)* %adr.c.18, align 4
+ %a.19 = load float, float addrspace(3)* %adr.a.19, align 4
+ %b.19 = load float, float addrspace(3)* %adr.b.19, align 4
+ %c.19 = load float, float addrspace(3)* %adr.c.19, align 4
+ %a.20 = load float, float addrspace(3)* %adr.a.20, align 4
+ %b.20 = load float, float addrspace(3)* %adr.b.20, align 4
+ %c.20 = load float, float addrspace(3)* %adr.c.20, align 4
+ %a.21 = load float, float addrspace(3)* %adr.a.21, align 4
+ %b.21 = load float, float addrspace(3)* %adr.b.21, align 4
+ %c.21 = load float, float addrspace(3)* %adr.c.21, align 4
+ %a.22 = load float, float addrspace(3)* %adr.a.22, align 4
+ %b.22 = load float, float addrspace(3)* %adr.b.22, align 4
+ %c.22 = load float, float addrspace(3)* %adr.c.22, align 4
+ %a.23 = load float, float addrspace(3)* %adr.a.23, align 4
+ %b.23 = load float, float addrspace(3)* %adr.b.23, align 4
+ %c.23 = load float, float addrspace(3)* %adr.c.23, align 4
+ %a.24 = load float, float addrspace(3)* %adr.a.24, align 4
+ %b.24 = load float, float addrspace(3)* %adr.b.24, align 4
+ %c.24 = load float, float addrspace(3)* %adr.c.24, align 4
+ %a.25 = load float, float addrspace(3)* %adr.a.25, align 4
+ %b.25 = load float, float addrspace(3)* %adr.b.25, align 4
+ %c.25 = load float, float addrspace(3)* %adr.c.25, align 4
+ %a.26 = load float, float addrspace(3)* %adr.a.26, align 4
+ %b.26 = load float, float addrspace(3)* %adr.b.26, align 4
+ %c.26 = load float, float addrspace(3)* %adr.c.26, align 4
+ %a.27 = load float, float addrspace(3)* %adr.a.27, align 4
+ %b.27 = load float, float addrspace(3)* %adr.b.27, align 4
+ %c.27 = load float, float addrspace(3)* %adr.c.27, align 4
+ %a.28 = load float, float addrspace(3)* %adr.a.28, align 4
+ %b.28 = load float, float addrspace(3)* %adr.b.28, align 4
+ %c.28 = load float, float addrspace(3)* %adr.c.28, align 4
+ %a.29 = load float, float addrspace(3)* %adr.a.29, align 4
+ %b.29 = load float, float addrspace(3)* %adr.b.29, align 4
+ %c.29 = load float, float addrspace(3)* %adr.c.29, align 4
+ %res.0 = tail call float @llvm.fmuladd.f32(float %a.0, float %b.0, float %c.0)
+ %res.1 = tail call float @llvm.fmuladd.f32(float %a.1, float %b.1, float %c.1)
+ %res.2 = tail call float @llvm.fmuladd.f32(float %a.2, float %b.2, float %c.2)
+ %res.3 = tail call float @llvm.fmuladd.f32(float %a.3, float %b.3, float %c.3)
+ %res.4 = tail call float @llvm.fmuladd.f32(float %a.4, float %b.4, float %c.4)
+ %res.5 = tail call float @llvm.fmuladd.f32(float %a.5, float %b.5, float %c.5)
+ %res.6 = tail call float @llvm.fmuladd.f32(float %a.6, float %b.6, float %c.6)
+ %res.7 = tail call float @llvm.fmuladd.f32(float %a.7, float %b.7, float %c.7)
+ %res.8 = tail call float @llvm.fmuladd.f32(float %a.8, float %b.8, float %c.8)
+ %res.9 = tail call float @llvm.fmuladd.f32(float %a.9, float %b.9, float %c.9)
+ %res.10 = tail call float @llvm.fmuladd.f32(float %a.10, float %b.10, float %c.10)
+ %res.11 = tail call float @llvm.fmuladd.f32(float %a.11, float %b.11, float %c.11)
+ %res.12 = tail call float @llvm.fmuladd.f32(float %a.12, float %b.12, float %c.12)
+ %res.13 = tail call float @llvm.fmuladd.f32(float %a.13, float %b.13, float %c.13)
+ %res.14 = tail call float @llvm.fmuladd.f32(float %a.14, float %b.14, float %c.14)
+ %res.15 = tail call float @llvm.fmuladd.f32(float %a.15, float %b.15, float %c.15)
+ %res.16 = tail call float @llvm.fmuladd.f32(float %a.16, float %b.16, float %c.16)
+ %res.17 = tail call float @llvm.fmuladd.f32(float %a.17, float %b.17, float %c.17)
+ %res.18 = tail call float @llvm.fmuladd.f32(float %a.18, float %b.18, float %c.18)
+ %res.19 = tail call float @llvm.fmuladd.f32(float %a.19, float %b.19, float %c.19)
+ %res.20 = tail call float @llvm.fmuladd.f32(float %a.20, float %b.20, float %c.20)
+ %res.21 = tail call float @llvm.fmuladd.f32(float %a.21, float %b.21, float %c.21)
+ %res.22 = tail call float @llvm.fmuladd.f32(float %a.22, float %b.22, float %c.22)
+ %res.23 = tail call float @llvm.fmuladd.f32(float %a.23, float %b.23, float %c.23)
+ %res.24 = tail call float @llvm.fmuladd.f32(float %a.24, float %b.24, float %c.24)
+ %res.25 = tail call float @llvm.fmuladd.f32(float %a.25, float %b.25, float %c.25)
+ %res.26 = tail call float @llvm.fmuladd.f32(float %a.26, float %b.26, float %c.26)
+ %res.27 = tail call float @llvm.fmuladd.f32(float %a.27, float %b.27, float %c.27)
+ %res.28 = tail call float @llvm.fmuladd.f32(float %a.28, float %b.28, float %c.28)
+ %res.29 = tail call float @llvm.fmuladd.f32(float %a.29, float %b.29, float %c.29)
+ %adr.res.0 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 0
+ %adr.res.1 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 2
+ %adr.res.2 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 4
+ %adr.res.3 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 6
+ %adr.res.4 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 8
+ %adr.res.5 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 10
+ %adr.res.6 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 12
+ %adr.res.7 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 14
+ %adr.res.8 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 16
+ %adr.res.9 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 18
+ %adr.res.10 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 20
+ %adr.res.11 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 22
+ %adr.res.12 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 24
+ %adr.res.13 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 26
+ %adr.res.14 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 28
+ %adr.res.15 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 30
+ %adr.res.16 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 32
+ %adr.res.17 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 34
+ %adr.res.18 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 36
+ %adr.res.19 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 38
+ %adr.res.20 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 40
+ %adr.res.21 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 42
+ %adr.res.22 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 44
+ %adr.res.23 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 46
+ %adr.res.24 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 48
+ %adr.res.25 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 50
+ %adr.res.26 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 52
+ %adr.res.27 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 54
+ %adr.res.28 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 56
+ %adr.res.29 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 58
+ store float %res.0, float addrspace(1)* %adr.res.0, align 4
+ store float %res.1, float addrspace(1)* %adr.res.1, align 4
+ store float %res.2, float addrspace(1)* %adr.res.2, align 4
+ store float %res.3, float addrspace(1)* %adr.res.3, align 4
+ store float %res.4, float addrspace(1)* %adr.res.4, align 4
+ store float %res.5, float addrspace(1)* %adr.res.5, align 4
+ store float %res.6, float addrspace(1)* %adr.res.6, align 4
+ store float %res.7, float addrspace(1)* %adr.res.7, align 4
+ store float %res.8, float addrspace(1)* %adr.res.8, align 4
+ store float %res.9, float addrspace(1)* %adr.res.9, align 4
+ store float %res.10, float addrspace(1)* %adr.res.10, align 4
+ store float %res.11, float addrspace(1)* %adr.res.11, align 4
+ store float %res.12, float addrspace(1)* %adr.res.12, align 4
+ store float %res.13, float addrspace(1)* %adr.res.13, align 4
+ store float %res.14, float addrspace(1)* %adr.res.14, align 4
+ store float %res.15, float addrspace(1)* %adr.res.15, align 4
+ store float %res.16, float addrspace(1)* %adr.res.16, align 4
+ store float %res.17, float addrspace(1)* %adr.res.17, align 4
+ store float %res.18, float addrspace(1)* %adr.res.18, align 4
+ store float %res.19, float addrspace(1)* %adr.res.19, align 4
+ store float %res.20, float addrspace(1)* %adr.res.20, align 4
+ store float %res.21, float addrspace(1)* %adr.res.21, align 4
+ store float %res.22, float addrspace(1)* %adr.res.22, align 4
+ store float %res.23, float addrspace(1)* %adr.res.23, align 4
+ store float %res.24, float addrspace(1)* %adr.res.24, align 4
+ store float %res.25, float addrspace(1)* %adr.res.25, align 4
+ store float %res.26, float addrspace(1)* %adr.res.26, align 4
+ store float %res.27, float addrspace(1)* %adr.res.27, align 4
+ store float %res.28, float addrspace(1)* %adr.res.28, align 4
+ store float %res.29, float addrspace(1)* %adr.res.29, align 4
+ ret void
+}
+declare float @llvm.fmuladd.f32(float, float, float) #0
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/schedule-regpressure.mir b/test/CodeGen/AMDGPU/schedule-regpressure.mir
new file mode 100644
index 000000000000..c71de87eeece
--- /dev/null
+++ b/test/CodeGen/AMDGPU/schedule-regpressure.mir
@@ -0,0 +1,57 @@
+# RUN: llc -march=amdgcn -misched=converge -run-pass machine-scheduler %s -o - -debug-only=misched 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# Check that DS_* instructions create no SReg_32 pressure through their implicit M0 use
+
+# CHECK: ScheduleDAGMILive::schedule starting
+# CHECK: SU({{.*}} = DS_READ_B32 {{.*}} %M0<imp-use>, %EXEC<imp-use>
+# CHECK: Pressure Diff : {{$}}
+# CHECK: SU({{.*}} DS_WRITE_B32
+
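+# A brief gloss on the checks above: DS_READ_B32 carries implicit uses of
+# M0 and EXEC, and the empty "Pressure Diff :" line for that SU asserts
+# that the scheduler records no SReg_32 pressure change for it, i.e. the
+# reserved M0 use is not counted against the SReg_32 pressure set.
+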
+---
+name: mo_pset
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_128 }
+ - { id: 1, class: sgpr_64 }
+ - { id: 2, class: sreg_32_xm0 }
+ - { id: 3, class: sgpr_32 }
+ - { id: 4, class: vgpr_32 }
+ - { id: 5, class: sreg_32_xm0_xexec }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: vgpr_32 }
+ - { id: 8, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr4_sgpr5', virtual-reg: '%1' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0:
+ liveins: %sgpr4_sgpr5
+
+ %1 = COPY %sgpr4_sgpr5
+ %5 = S_LOAD_DWORD_IMM %1, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+ %m0 = S_MOV_B32 -1
+ %7 = COPY %5
+ %6 = DS_READ_B32 %7, 0, 0, implicit %m0, implicit %exec
+ DS_WRITE_B32 %7, %6, 4, 0, implicit killed %m0, implicit %exec
+ S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/scratch-buffer.ll b/test/CodeGen/AMDGPU/scratch-buffer.ll
index 94101f0b92b6..6b1e85915a11 100644
--- a/test/CodeGen/AMDGPU/scratch-buffer.ll
+++ b/test/CodeGen/AMDGPU/scratch-buffer.ll
@@ -9,11 +9,11 @@
; should be able to reuse the same register for each scratch buffer access.
; GCN-LABEL: {{^}}legal_offset_fi:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+$}}
-; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x8000
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
+; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x8004
; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
-define void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) {
+define amdgpu_kernel void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) {
entry:
%scratch0 = alloca [8192 x i32]
%scratch1 = alloca [8192 x i32]
@@ -49,11 +49,11 @@ done:
; GCN-LABEL: {{^}}legal_offset_fi_offset:
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
; This constant isn't folded, because it has multiple uses.
-; GCN-DAG: v_mov_b32_e32 [[K8000:v[0-9]+]], 0x8000
+; GCN-DAG: v_mov_b32_e32 [[K8000:v[0-9]+]], 0x8004
; GCN-DAG: v_add_i32_e32 [[OFFSET:v[0-9]+]], vcc, [[K8000]]
; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
-define void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) {
+define amdgpu_kernel void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) {
entry:
%scratch0 = alloca [8192 x i32]
%scratch1 = alloca [8192 x i32]
@@ -88,7 +88,7 @@ done:
; GCN-LABEL: {{^}}neg_vaddr_offset:
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16{{$}}
-define void @neg_vaddr_offset(i32 %offset) {
+define amdgpu_kernel void @neg_vaddr_offset(i32 %offset) {
entry:
%array = alloca [8192 x i32]
%ptr_offset = add i32 %offset, 4
@@ -98,8 +98,8 @@ entry:
}
; GCN-LABEL: {{^}}pos_vaddr_offset:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:16
-define void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) {
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:20
+define amdgpu_kernel void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) {
entry:
%array = alloca [8192 x i32]
%ptr = getelementptr [8192 x i32], [8192 x i32]* %array, i32 0, i32 4
diff --git a/test/CodeGen/AMDGPU/sdiv.ll b/test/CodeGen/AMDGPU/sdiv.ll
index bafd6a50ccfe..f9ac425be794 100644
--- a/test/CodeGen/AMDGPU/sdiv.ll
+++ b/test/CodeGen/AMDGPU/sdiv.ll
@@ -13,7 +13,7 @@
; FUNC-LABEL: {{^}}sdiv_i32:
; EG: CF_END
-define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in
%den = load i32, i32 addrspace(1) * %den_ptr
@@ -23,7 +23,7 @@ define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
}
; FUNC-LABEL: {{^}}sdiv_i32_4:
-define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%num = load i32, i32 addrspace(1) * %in
%result = sdiv i32 %num, 4
store i32 %result, i32 addrspace(1)* %out
@@ -43,14 +43,14 @@ define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; SI: v_add_i32
; SI: buffer_store_dword
; SI: s_endpgm
-define void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%num = load i32, i32 addrspace(1) * %in
%result = sdiv i32 %num, 3435
store i32 %result, i32 addrspace(1)* %out
ret void
}
-define void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
%num = load <2 x i32>, <2 x i32> addrspace(1) * %in
%den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr
@@ -59,14 +59,14 @@ define void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
ret void
}
-define void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%num = load <2 x i32>, <2 x i32> addrspace(1) * %in
%result = sdiv <2 x i32> %num, <i32 4, i32 4>
store <2 x i32> %result, <2 x i32> addrspace(1)* %out
ret void
}
-define void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%num = load <4 x i32>, <4 x i32> addrspace(1) * %in
%den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr
@@ -75,7 +75,7 @@ define void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
ret void
}
-define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%num = load <4 x i32>, <4 x i32> addrspace(1) * %in
%result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
store <4 x i32> %result, <4 x i32> addrspace(1)* %out
@@ -86,7 +86,7 @@ define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)*
; SI: v_rcp_f32
; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 8
; SI: buffer_store_dword [[BFE]]
-define void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
%den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
%num = load i8, i8 addrspace(1) * %in
%den = load i8, i8 addrspace(1) * %den_ptr
@@ -100,7 +100,7 @@ define void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
; SI: v_rcp_f32
; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 23
; SI: buffer_store_dword [[BFE]]
-define void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
%den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
%num = load i23, i23 addrspace(1) * %in
%den = load i23, i23 addrspace(1) * %den_ptr
@@ -114,7 +114,7 @@ define void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
; SI: v_rcp_f32
; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 24
; SI: buffer_store_dword [[BFE]]
-define void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
%den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
%num = load i24, i24 addrspace(1) * %in
%den = load i24, i24 addrspace(1) * %den_ptr
@@ -126,7 +126,7 @@ define void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
; FUNC-LABEL: {{^}}v_sdiv_i25:
; SI-NOT: v_rcp_f32
-define void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) {
%den_ptr = getelementptr i25, i25 addrspace(1)* %in, i25 1
%num = load i25, i25 addrspace(1) * %in
%den = load i25, i25 addrspace(1) * %den_ptr
@@ -137,19 +137,19 @@ define void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) {
}
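The i23/i24/i25 trio above probes the float-reciprocal fast path: an f32 significand carries 24 bits, so a signed division whose operands sign-extend from at most 24 bits can be computed exactly as a float multiply by the reciprocal plus a small fixup, roughly

$$ q \;=\; \big\lfloor n \cdot \mathrm{rcp}(d) \big\rfloor + \epsilon, \qquad \text{exact for } |n|, |d| < 2^{24}, $$

which is why v_sdiv_i24 expects v_rcp_f32 followed by v_bfe_i32 ..., 0, 24, while v_sdiv_i25 asserts SI-NOT: v_rcp_f32.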
; Tests for 64-bit divide bypass.
-; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+; define amdgpu_kernel void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; %result = sdiv i64 %a, %b
; store i64 %result, i64 addrspace(1)* %out, align 8
; ret void
; }
-; define void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+; define amdgpu_kernel void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; %result = srem i64 %a, %b
; store i64 %result, i64 addrspace(1)* %out, align 8
; ret void
; }
-; define void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+; define amdgpu_kernel void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; %resultdiv = sdiv i64 %a, %b
; %resultrem = srem i64 %a, %b
; %result = add i64 %resultdiv, %resultrem
@@ -163,7 +163,7 @@ define void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) {
; SI: v_mul_hi_i32
; SI: v_mul_hi_i32
-define void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
+define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
%1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
%2 = sdiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
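For the non-power-of-two constants (3435 in slow_sdiv_i32_3435, 53668 here), the expected lowering is the usual magic-number transformation: the divide becomes a high multiply and a shift,

$$ n \;\mathrm{sdiv}\; d \;\approx\; \mathrm{mulhi}(n, m) \gg s \;\;(+\ \text{sign fixup}) $$

for a precomputed pair (m, s), which is why scalarize_mulhs_4xi32 checks for a v_mul_hi_i32 per vector lane instead of a real division sequence.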
diff --git a/test/CodeGen/AMDGPU/sdivrem24.ll b/test/CodeGen/AMDGPU/sdivrem24.ll
index 349a7821da17..257e6be96b65 100644
--- a/test/CodeGen/AMDGPU/sdivrem24.ll
+++ b/test/CodeGen/AMDGPU/sdivrem24.ll
@@ -12,7 +12,7 @@
; EG-DAG: INT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_INT
-define void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
%den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
%num = load i8, i8 addrspace(1) * %in
%den = load i8, i8 addrspace(1) * %den_ptr
@@ -31,7 +31,7 @@ define void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
; EG-DAG: INT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_INT
-define void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
%den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
%num = load i16, i16 addrspace(1) * %in, align 2
%den = load i16, i16 addrspace(1) * %den_ptr, align 2
@@ -50,7 +50,7 @@ define void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
; EG-DAG: INT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_INT
-define void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -69,7 +69,7 @@ define void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; EG-NOT: INT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -88,7 +88,7 @@ define void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; EG-NOT: INT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -107,7 +107,7 @@ define void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; EG-NOT: INT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -130,7 +130,7 @@ define void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; EG-DAG: INT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_INT
-define void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
%den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
%num = load i8, i8 addrspace(1) * %in
%den = load i8, i8 addrspace(1) * %den_ptr
@@ -149,7 +149,7 @@ define void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
; EG-DAG: INT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_INT
-define void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
%den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
%num = load i16, i16 addrspace(1) * %in, align 2
%den = load i16, i16 addrspace(1) * %den_ptr, align 2
@@ -168,7 +168,7 @@ define void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
; EG-DAG: INT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_INT
-define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -187,7 +187,7 @@ define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; EG-NOT: INT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @no_srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -206,7 +206,7 @@ define void @no_srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; EG-NOT: INT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -225,7 +225,7 @@ define void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in
; EG-NOT: INT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -244,7 +244,7 @@ define void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in
; EG-NOT: INT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -263,7 +263,7 @@ define void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in
; EG-NOT: INT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -283,7 +283,7 @@ define void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in
; EG: INT_TO_FLT
; EG: RECIP_IEEE
-define void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -303,7 +303,7 @@ define void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; EG: INT_TO_FLT
; EG: RECIP_IEEE
-define void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -323,7 +323,7 @@ define void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; EG: INT_TO_FLT
; EG: RECIP_IEEE
-define void @srem25_i17_i12_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @srem25_i17_i12_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
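sdivrem24.ll applies the same 24-bit criterion on both SI and Evergreen (INT_TO_FLT/RECIP_IEEE on EG), and its mixed-width tests pin down that the fast path demands it of both operands: in terms of sign-extended width it fires only when

$$ \mathrm{bits}(n) \le 24 \;\wedge\; \mathrm{bits}(d) \le 24, $$

so srem25_i24_i11_i32 and srem25_i17_i12_i32 still fold, while no_sdiv25_i24_i25_i32 and its variants must not.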
diff --git a/test/CodeGen/AMDGPU/sdivrem64.ll b/test/CodeGen/AMDGPU/sdivrem64.ll
index 28fdb69e1ada..5ad0d8efaed3 100644
--- a/test/CodeGen/AMDGPU/sdivrem64.ll
+++ b/test/CodeGen/AMDGPU/sdivrem64.ll
@@ -70,7 +70,7 @@
; SI-NOT: v_lshr_b64
; VI-NOT: v_lshrrev_b64
; GCN: s_endpgm
-define void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
%result = sdiv i64 %x, %y
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -144,7 +144,7 @@ define void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
;SI-NOT: v_lshr_b64
;VI-NOT: v_lshrrev_b64
;GCN: s_endpgm
-define void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
%result = urem i64 %x, %y
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -159,7 +159,7 @@ define void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
;SI-NOT: v_lshr_b64
;VI-NOT: v_lshrrev_b64
;GCN: s_endpgm
-define void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
%1 = ashr i64 %x, 33
%2 = ashr i64 %y, 33
%result = sdiv i64 %1, %2
@@ -176,7 +176,7 @@ define void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
;SI-NOT: v_lshr_b64
;VI-NOT: v_lshrrev_b64
;GCN: s_endpgm
-define void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
%1 = ashr i64 %x, 33
%2 = ashr i64 %y, 33
%result = srem i64 %1, %2
@@ -196,7 +196,7 @@ define void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
;SI-NOT: v_lshr_b64
;VI-NOT: v_lshrrev_b64
;GCN: s_endpgm
-define void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
%1 = ashr i64 %x, 40
%2 = ashr i64 %y, 40
%result = sdiv i64 %1, %2
@@ -216,7 +216,7 @@ define void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
;SI-NOT: v_lshr_b64
;VI-NOT: v_lshrrev_b64
;GCN: s_endpgm
-define void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
%1 = ashr i64 %x, 40
%2 = ashr i64 %y, 40
%result = srem i64 %1, %2
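The 3264/2464 tests in sdivrem64.ll feed 64-bit divides whose operands are pre-shifted, so only a narrow range of bits can be set:

$$ 64 - 33 = 31 \le 32, \qquad 64 - 40 = 24 \le 24. $$

The SI-NOT/VI-NOT lines on v_lshr_b64 and v_lshrrev_b64 then assert that llc notices the reduced width and shrinks the operation to the 32-bit (or, for the 2464 cases, 24-bit) path instead of emitting the full 64-bit expansion counted in s_test_sdiv and s_test_srem.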
diff --git a/test/CodeGen/AMDGPU/sdwa-peephole.ll b/test/CodeGen/AMDGPU/sdwa-peephole.ll
new file mode 100644
index 000000000000..1e0ac3807528
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -0,0 +1,395 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SDWA -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}add_shr_i32:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]]
+; NOSDWA-NOT: v_add_i32_sdwa
+
+; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+define amdgpu_kernel void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %a = load i32, i32 addrspace(1)* %in, align 4
+ %shr = lshr i32 %a, 16
+ %add = add i32 %a, %shr
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}sub_shr_i32:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_subrev_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]]
+; NOSDWA-NOT: v_subrev_i32_sdwa
+
+; SDWA: v_subrev_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+define amdgpu_kernel void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %a = load i32, i32 addrspace(1)* %in, align 4
+ %shr = lshr i32 %a, 16
+ %sub = sub i32 %shr, %a
+ store i32 %sub, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}mul_shr_i32:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v[[DST1]], v[[DST0]]
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+
+define amdgpu_kernel void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) {
+ %a = load i32, i32 addrspace(1)* %in1, align 4
+ %b = load i32, i32 addrspace(1)* %in2, align 4
+ %shra = lshr i32 %a, 16
+ %shrb = lshr i32 %b, 16
+ %mul = mul i32 %shra, %shrb
+ store i32 %mul, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}mul_i16:
+; NOSDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+; SDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-NOT: v_mul_u32_u24_sdwa
+
+define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) {
+entry:
+ %a = load i16, i16 addrspace(1)* %ina, align 4
+ %b = load i16, i16 addrspace(1)* %inb, align 4
+ %mul = mul i16 %a, %b
+ store i16 %mul, i16 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}mul_v2i16:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
+define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
+entry:
+ %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
+ %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
+ %mul = mul <2 x i16> %a, %b
+ store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}mul_v4i16:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
+define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) {
+entry:
+ %a = load <4 x i16>, <4 x i16> addrspace(1)* %ina, align 4
+ %b = load <4 x i16>, <4 x i16> addrspace(1)* %inb, align 4
+ %mul = mul <4 x i16> %a, %b
+ store <4 x i16> %mul, <4 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}mul_v8i16:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL7]], v[[DST_MUL6]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL5]], v[[DST_MUL4]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
+define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) {
+entry:
+ %a = load <8 x i16>, <8 x i16> addrspace(1)* %ina, align 4
+ %b = load <8 x i16>, <8 x i16> addrspace(1)* %inb, align 4
+ %mul = mul <8 x i16> %a, %b
+ store <8 x i16> %mul, <8 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}mul_half:
+; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_f16_sdwa
+; SDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-NOT: v_mul_f16_sdwa
+
+define amdgpu_kernel void @mul_half(half addrspace(1)* %out, half addrspace(1)* %ina, half addrspace(1)* %inb) {
+entry:
+ %a = load half, half addrspace(1)* %ina, align 4
+ %b = load half, half addrspace(1)* %inb, align 4
+ %mul = fmul half %a, %b
+ store half %mul, half addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}mul_v2half:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_f16_sdwa
+
+; SDWA-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]]
+define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
+entry:
+ %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
+ %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
+ %mul = fmul <2 x half> %a, %b
+ store <2 x half> %mul, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}mul_v4half:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_f16_sdwa
+
+; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) {
+entry:
+ %a = load <4 x half>, <4 x half> addrspace(1)* %ina, align 4
+ %b = load <4 x half>, <4 x half> addrspace(1)* %inb, align 4
+ %mul = fmul <4 x half> %a, %b
+ store <4 x half> %mul, <4 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}mul_v8half:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_f16_sdwa
+
+; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) {
+entry:
+ %a = load <8 x half>, <8 x half> addrspace(1)* %ina, align 4
+ %b = load <8 x half>, <8 x half> addrspace(1)* %inb, align 4
+ %mul = fmul <8 x half> %a, %b
+ store <8 x half> %mul, <8 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}mul_i8:
+; NOSDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+; SDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-NOT: v_mul_u32_u24_sdwa
+
+define amdgpu_kernel void @mul_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %ina, i8 addrspace(1)* %inb) {
+entry:
+ %a = load i8, i8 addrspace(1)* %ina, align 4
+ %b = load i8, i8 addrspace(1)* %inb, align 4
+ %mul = mul i8 %a, %b
+ store i8 %mul, i8 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}mul_v2i8:
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+
+define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) {
+entry:
+ %a = load <2 x i8>, <2 x i8> addrspace(1)* %ina, align 4
+ %b = load <2 x i8>, <2 x i8> addrspace(1)* %inb, align 4
+ %mul = mul <2 x i8> %a, %b
+ store <2 x i8> %mul, <2 x i8> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}mul_v4i8:
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA-DAG: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
+
+define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) {
+entry:
+ %a = load <4 x i8>, <4 x i8> addrspace(1)* %ina, align 4
+ %b = load <4 x i8>, <4 x i8> addrspace(1)* %inb, align 4
+ %mul = mul <4 x i8> %a, %b
+ store <4 x i8> %mul, <4 x i8> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}mul_v8i8:
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA-DAG: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
+
+define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) {
+entry:
+ %a = load <8 x i8>, <8 x i8> addrspace(1)* %ina, align 4
+ %b = load <8 x i8>, <8 x i8> addrspace(1)* %inb, align 4
+ %mul = mul <8 x i8> %a, %b
+ store <8 x i8> %mul, <8 x i8> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}sitofp_v2i16_to_v2f16:
+; NOSDWA-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; NOSDWA-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA-DAG: v_cvt_f32_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-DAG: v_cvt_f32_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_cvt_f32_i32_sdwa
+
+; SDWA-DAG: v_cvt_f32_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; SDWA-DAG: v_cvt_f32_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+
+define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
+ <2 x half> addrspace(1)* %r,
+ <2 x i16> addrspace(1)* %a) {
+entry:
+ %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
+ %r.val = sitofp <2 x i16> %a.val to <2 x half>
+ store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+ ret void
+}
+
+
+; GCN-LABEL: {{^}}mac_v2half:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA-NOT: v_mac_f16_sdwa
+
+; SDWA: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
+
+define amdgpu_kernel void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
+entry:
+ %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
+ %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
+ %mul = fmul <2 x half> %a, %b
+ %mac = fadd <2 x half> %mul, %b
+ store <2 x half> %mac, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}immediate_mul_v2i16:
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+; SDWA-NOT: v_mul_u32_u24_sdwa
+
+define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+entry:
+ %a = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
+ %mul = mul <2 x i16> %a, <i16 123, i16 321>
+ store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+; Double use of the same source operand - should not convert it
+; GCN-LABEL: {{^}}mulmul_v2i16:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
+entry:
+ %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
+ %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
+ %mul = mul <2 x i16> %a, %b
+ %mul2 = mul <2 x i16> %mul, %b
+ store <2 x i16> %mul2, <2 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_bb_v2i16:
+; NOSDWA-NOT: v_add_i32_sdwa
+
+; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+
+define amdgpu_kernel void @add_bb_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
+entry:
+ %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
+ %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
+ br label %add_label
+add_label:
+ %add = add <2 x i16> %a, %b
+ br label %store_label
+store_label:
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out, align 4
+ ret void
+}
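The new sdwa-peephole.ll covers SDWA (sub-dword addressing) folding on Fiji: VALU instructions there can read and write individual bytes or words of a 32-bit register through operand selects, so a shift that only extracts a sub-word can be absorbed into its user. A rough before/after sketch of the first test's pattern (illustrative, not FileCheck output):

    v_lshrrev_b32_e32 v1, 16, v0
    v_add_i32_e32     v2, vcc, v0, v1
    ; folds into one instruction that reads v0's high word in place:
    v_add_i32_sdwa    v2, vcc, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

Each test therefore runs llc twice: with -amdgpu-sdwa-peephole=0 (NOSDWA, the shift must survive) and with the peephole enabled (SDWA, the folded form is expected). Cases like mul_i16, mul_half, and immediate_mul_v2i16 check that the fold stays off where no sub-word extraction exists, and add_bb_v2i16 that it still fires across basic-block boundaries.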
diff --git a/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll b/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
index 559d464f36a5..c8839c17365e 100644
--- a/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
+++ b/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
@@ -11,7 +11,7 @@
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc
; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
-define void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 {
+define amdgpu_kernel void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -29,7 +29,7 @@ define void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 {
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[MUL]], vcc
; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
-define void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #2 {
+define amdgpu_kernel void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #2 {
%x = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
%mul = call float @llvm.amdgcn.fmul.legacy(float %x, float 4.0)
diff --git a/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
index 73dadde884ae..3417eb02b361 100644
--- a/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
+++ b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
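These RUN lines trade the blanket -enable-unsafe-fp-math for the narrower -enable-no-signed-zeros-fp-math, evidently because the folds under test, which move fneg and fabs across selects, need only permission to ignore the sign of zero rather than the whole unsafe-math bundle. In IR of this vintage negation is spelled as a subtraction from -0.0, so the tests below build patterns of the shape (sketch):

    %cmp = icmp eq i32 %c, 0
    %sel = select i1 %cmp, float %x, float %y
    %neg = fsub float -0.000000e+00, %sel  ; treated as fneg under no-signed-zeros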
; GCN-LABEL: {{^}}add_select_fabs_fabs_f32:
; GCN: buffer_load_dword [[X:v[0-9]+]]
@@ -8,7 +8,7 @@
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]]
-define void @add_select_fabs_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fabs_fabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -30,7 +30,7 @@ define void @add_select_fabs_fabs_f32(i32 %c) #0 {
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]]
; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, [[W]]
-define void @add_select_multi_use_lhs_fabs_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_multi_use_lhs_fabs_fabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -57,7 +57,7 @@ define void @add_select_multi_use_lhs_fabs_fabs_f32(i32 %c) #0 {
; GCN: buffer_store_dword [[ADD]]
; GCN: buffer_store_dword [[X_ABS]]
-define void @add_select_multi_store_use_lhs_fabs_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_multi_store_use_lhs_fabs_fabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -80,7 +80,7 @@ define void @add_select_multi_store_use_lhs_fabs_fabs_f32(i32 %c) #0 {
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]]
; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[Y]]|, [[W]]
-define void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -104,7 +104,7 @@ define void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 {
; GCN: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_ABS]], vcc
; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
-define void @add_select_fabs_var_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fabs_var_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -123,7 +123,7 @@ define void @add_select_fabs_var_f32(i32 %c) #0 {
; GCN: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc
; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
-define void @add_select_fabs_negk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fabs_negk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -140,7 +140,7 @@ define void @add_select_fabs_negk_f32(i32 %c) #0 {
; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[X]]
-define void @add_select_fabs_negk_negk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fabs_negk_negk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, float -2.0, float -1.0
@@ -155,7 +155,7 @@ define void @add_select_fabs_negk_negk_f32(i32 %c) #0 {
; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 1.0, 2.0, s
; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
-define void @add_select_posk_posk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_posk_posk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, float 2.0, float 1.0
@@ -172,7 +172,7 @@ define void @add_select_posk_posk_f32(i32 %c) #0 {
; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc
; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
-define void @add_select_negk_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -192,7 +192,7 @@ define void @add_select_negk_fabs_f32(i32 %c) #0 {
; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[FABS_X]], vcc
; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
-define void @add_select_negliteralk_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_negliteralk_fabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -209,7 +209,7 @@ define void @add_select_negliteralk_fabs_f32(i32 %c) #0 {
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]]
-define void @add_select_fabs_posk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fabs_posk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -228,7 +228,7 @@ define void @add_select_fabs_posk_f32(i32 %c) #0 {
; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]]
-define void @add_select_posk_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_posk_fabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -246,7 +246,7 @@ define void @add_select_posk_fabs_f32(i32 %c) #0 {
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
-define void @add_select_fneg_fneg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fneg_fneg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -268,7 +268,7 @@ define void @add_select_fneg_fneg_f32(i32 %c) #0 {
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[X]], [[W]]
-define void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -295,7 +295,7 @@ define void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 {
; GCN: buffer_store_dword [[ADD]]
; GCN: buffer_store_dword [[NEG_X]]
-define void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -318,7 +318,7 @@ define void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c) #0 {
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[Y]], [[W]]
-define void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -342,7 +342,7 @@ define void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 {
; GCN: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_NEG]], vcc
; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
-define void @add_select_fneg_var_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fneg_var_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -360,7 +360,7 @@ define void @add_select_fneg_var_f32(i32 %c) #0 {
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
-define void @add_select_fneg_negk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fneg_negk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -372,13 +372,13 @@ define void @add_select_fneg_negk_f32(i32 %c) #0 {
}
; GCN-LABEL: {{^}}add_select_fneg_inv2pi_f32:
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
-define void @add_select_fneg_inv2pi_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -390,15 +390,15 @@ define void @add_select_fneg_inv2pi_f32(i32 %c) #0 {
}
; GCN-LABEL: {{^}}add_select_fneg_neginv2pi_f32:
-; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983
; SI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc
; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
-define void @add_select_fneg_neginv2pi_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fneg_neginv2pi_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
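The literals in these two hunks decode to the same magnitude:

$$ \mathtt{0x3e22f983} = 0.15915494\ldots \approx \tfrac{1}{2\pi}, \qquad \mathtt{0xbe22f983} = -\tfrac{1}{2\pi}. $$

VI accepts +1/(2π) as a hardware inline immediate, so once the fneg is folded the neginv2pi test can feed 0.15915494 directly into v_cndmask_b32, while SI, and the -1/(2π) case on both targets, must first materialize the literal with v_mov_b32; the reordered checks simply track where that v_mov now lands relative to the buffer loads.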
@@ -415,7 +415,7 @@ define void @add_select_fneg_neginv2pi_f32(i32 %c) #0 {
; GCN: v_cmp_eq_u32_e64
; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
-define void @add_select_negk_negk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_negk_negk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, float -2.0, float -1.0
@@ -432,7 +432,7 @@ define void @add_select_negk_negk_f32(i32 %c) #0 {
; GCN: v_cmp_eq_u32_e64
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K1]], [[K0]], vcc
; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
-define void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, float -2048.0, float -4096.0
@@ -446,7 +446,7 @@ define void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 {
; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
-define void @add_select_fneg_negk_negk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fneg_negk_negk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, float -2.0, float -1.0
@@ -463,7 +463,7 @@ define void @add_select_fneg_negk_negk_f32(i32 %c) #0 {
; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
-define void @add_select_negk_fneg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_negk_fneg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -480,7 +480,7 @@ define void @add_select_negk_fneg_f32(i32 %c) #0 {
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
-define void @add_select_fneg_posk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fneg_posk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -498,7 +498,7 @@ define void @add_select_fneg_posk_f32(i32 %c) #0 {
; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
-define void @add_select_posk_fneg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_posk_fneg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -518,7 +518,7 @@ define void @add_select_posk_fneg_f32(i32 %c) #0 {
; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG_ABS]], vcc
; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
-define void @add_select_negfabs_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_negfabs_fabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -541,7 +541,7 @@ define void @add_select_negfabs_fabs_f32(i32 %c) #0 {
; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG_ABS]], [[X_ABS]], vcc
; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
-define void @add_select_fabs_negfabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fabs_negfabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -564,7 +564,7 @@ define void @add_select_fabs_negfabs_f32(i32 %c) #0 {
; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG]], vcc
; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
-define void @add_select_neg_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_neg_fabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -586,7 +586,7 @@ define void @add_select_neg_fabs_f32(i32 %c) #0 {
; GCN-DAG: v_xor_b32_e32 [[Y_NEG:v[0-9]+]], 0x80000000, [[Y]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG]], [[X_ABS]], vcc
; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
-define void @add_select_fabs_neg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fabs_neg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -607,7 +607,7 @@ define void @add_select_fabs_neg_f32(i32 %c) #0 {
; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X]], vcc
; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
-define void @add_select_neg_negfabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_neg_negfabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -629,7 +629,7 @@ define void @add_select_neg_negfabs_f32(i32 %c) #0 {
; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[X_ABS]], [[Y]], vcc
; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
-define void @add_select_negfabs_neg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_negfabs_neg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
@@ -651,7 +651,7 @@ define void @add_select_negfabs_neg_f32(i32 %c) #0 {
; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -4.0, [[X_ABS]], vcc
; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
-define void @mul_select_negfabs_posk_f32(i32 %c) #0 {
+define amdgpu_kernel void @mul_select_negfabs_posk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -672,7 +672,7 @@ define void @mul_select_negfabs_posk_f32(i32 %c) #0 {
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -4.0, [[X_ABS]], vcc
; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
-define void @mul_select_posk_negfabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @mul_select_posk_negfabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -690,7 +690,7 @@ define void @mul_select_posk_negfabs_f32(i32 %c) #0 {
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc
; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
-define void @mul_select_negfabs_negk_f32(i32 %c) #0 {
+define amdgpu_kernel void @mul_select_negfabs_negk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -709,7 +709,7 @@ define void @mul_select_negfabs_negk_f32(i32 %c) #0 {
; GCN: v_cmp_ne_u32_e64 vcc
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc
; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
-define void @mul_select_negk_negfabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @mul_select_negk_negfabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -732,7 +732,7 @@ define void @mul_select_negk_negfabs_f32(i32 %c) #0 {
; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], -4.0, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc
; GCN-NEXT: buffer_store_dword [[SELECT]]
-define void @select_fneg_posk_src_add_f32(i32 %c) #0 {
+define amdgpu_kernel void @select_fneg_posk_src_add_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -749,7 +749,7 @@ define void @select_fneg_posk_src_add_f32(i32 %c) #0 {
; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], 4.0, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc
; GCN-NEXT: buffer_store_dword [[SELECT]]
-define void @select_fneg_posk_src_sub_f32(i32 %c) #0 {
+define amdgpu_kernel void @select_fneg_posk_src_sub_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
%add = fsub float %x, 4.0
@@ -765,7 +765,7 @@ define void @select_fneg_posk_src_sub_f32(i32 %c) #0 {
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[MUL]], vcc
; GCN-NEXT: buffer_store_dword [[SELECT]]
-define void @select_fneg_posk_src_mul_f32(i32 %c) #0 {
+define amdgpu_kernel void @select_fneg_posk_src_mul_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
%mul = fmul float %x, 4.0
@@ -782,7 +782,7 @@ define void @select_fneg_posk_src_mul_f32(i32 %c) #0 {
; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[X]], -4.0, -[[Z]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[FMA]], vcc
; GCN-NEXT: buffer_store_dword [[SELECT]]
-define void @select_fneg_posk_src_fma_f32(i32 %c) #0 {
+define amdgpu_kernel void @select_fneg_posk_src_fma_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -799,7 +799,7 @@ define void @select_fneg_posk_src_fma_f32(i32 %c) #0 {
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[X]], vcc
; GCN-NEXT: buffer_store_dword [[SELECT]]
-define void @select_fneg_posk_src_fmad_f32(i32 %c) #0 {
+define amdgpu_kernel void @select_fneg_posk_src_fmad_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%z = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -818,7 +818,7 @@ define void @select_fneg_posk_src_fmad_f32(i32 %c) #0 {
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc
; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
-define void @select_fneg_posk_src_rcp_f32(i32 %c) #0 {
+define amdgpu_kernel void @select_fneg_posk_src_rcp_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
diff --git a/test/CodeGen/AMDGPU/select-i1.ll b/test/CodeGen/AMDGPU/select-i1.ll
index 07dcb2153384..5eaad1f363f9 100644
--- a/test/CodeGen/AMDGPU/select-i1.ll
+++ b/test/CodeGen/AMDGPU/select-i1.ll
@@ -6,7 +6,7 @@
; FUNC-LABEL: {{^}}select_i1:
; SI: v_cndmask_b32
; SI-NOT: v_cndmask_b32
-define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind {
+define amdgpu_kernel void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind {
%cmp = icmp ugt i32 %cond, 5
%sel = select i1 %cmp, i1 %a, i1 %b
store i1 %sel, i1 addrspace(1)* %out, align 4
@@ -19,7 +19,7 @@ define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind
; SI-DAG: buffer_load_ubyte [[B:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
; SI: v_cmp_eq_u32_e32 vcc, 1, [[COND]]
; SI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
-define void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
+define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
%cmp = icmp slt i1 %cond, false
%sel = select i1 %cmp, i1 %a, i1 %b
store i1 %sel, i1 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/select-opt.ll b/test/CodeGen/AMDGPU/select-opt.ll
index ad358d33c405..d56b952118b5 100644
--- a/test/CodeGen/AMDGPU/select-opt.ll
+++ b/test/CodeGen/AMDGPU/select-opt.ll
@@ -11,7 +11,7 @@
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
-define void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
%icmp0 = icmp ne i32 %a, %b
%icmp1 = icmp ne i32 %a, %c
%and = and i1 %icmp0, %icmp1
@@ -27,7 +27,7 @@ define void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b,
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
-define void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
%fcmp0 = fcmp one float %a, %b
%fcmp1 = fcmp one float %a, %c
%and = and i1 %fcmp0, %fcmp1
@@ -43,7 +43,7 @@ define void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float
; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
-define void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
+define amdgpu_kernel void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
%icmp0 = icmp ne i32 %a, %b
%icmp1 = icmp ne i32 %a, %c
%and = and i1 %icmp0, %icmp1
@@ -59,7 +59,7 @@ define void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b,
; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
-define void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
+define amdgpu_kernel void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
%fcmp0 = fcmp one float %a, %b
%fcmp1 = fcmp one float %a, %c
%and = and i1 %fcmp0, %fcmp1
@@ -76,7 +76,7 @@ define void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
; GCN: s_endpgm
-define void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
%icmp0 = icmp ne i32 %a, %b
%icmp1 = icmp ne i32 %a, %c
%or = or i1 %icmp0, %icmp1
@@ -92,7 +92,7 @@ define void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
-define void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
%fcmp0 = fcmp one float %a, %b
%fcmp1 = fcmp one float %a, %c
%or = or i1 %fcmp0, %fcmp1
@@ -108,7 +108,7 @@ define void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float %
; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
-define void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
+define amdgpu_kernel void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
%icmp0 = icmp ne i32 %a, %b
%icmp1 = icmp ne i32 %a, %c
%or = or i1 %icmp0, %icmp1
@@ -124,7 +124,7 @@ define void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i
; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
-define void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
+define amdgpu_kernel void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
%fcmp0 = fcmp one float %a, %b
%fcmp1 = fcmp one float %a, %c
%or = or i1 %fcmp0, %fcmp1
@@ -138,7 +138,7 @@ define void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %
; GCN: v_cmp_neq_f32_e64 vcc, s{{[0-9]+}}, 0
; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
-define void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {
+define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {
entry:
%cmp0 = fcmp oeq float %c0, 1.0
br i1 %cmp0, label %if0, label %endif
diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll
index 759abe2f2e9a..8710fc8c7307 100644
--- a/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/test/CodeGen/AMDGPU/select-vectors.ll
@@ -10,7 +10,7 @@
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
-define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind {
+define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind {
%cmp = icmp eq i8 %c, 0
%select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
@@ -22,7 +22,7 @@ define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b,
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
-define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind {
+define amdgpu_kernel void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4
@@ -36,7 +36,7 @@ define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16>
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
; SI: buffer_store_dwordx2
-define void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind {
+define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8
@@ -49,7 +49,7 @@ define void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
; SI: buffer_store_dwordx4
-define void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind {
+define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b
store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16
@@ -64,7 +64,7 @@ define void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32
; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; SI: buffer_store_dwordx4
-define void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 {
bb:
%tmp2 = icmp ult i32 %cond, 32
%val = load <4 x i32>, <4 x i32> addrspace(1)* %in
@@ -82,7 +82,7 @@ bb:
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
-define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind {
+define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16
@@ -102,7 +102,7 @@ define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32>
; SI: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
; SI: v_cndmask_b32_e32
; SI: buffer_store_dwordx2
-define void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind {
+define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <2 x float> %a, <2 x float> %b
store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16
@@ -120,7 +120,7 @@ define void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x
; SI: v_cndmask_b32_e32
; SI: buffer_store_dwordx4
-define void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind {
+define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <4 x float> %a, <4 x float> %b
store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16
@@ -135,7 +135,7 @@ define void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x
; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; SI: buffer_store_dwordx4
-define void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 {
bb:
%tmp2 = icmp ult i32 %cond, 32
%val = load <4 x float>, <4 x float> addrspace(1)* %in
@@ -153,7 +153,7 @@ bb:
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
-define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind {
+define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <8 x float> %a, <8 x float> %b
store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16
@@ -165,7 +165,7 @@ define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x f
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
-define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind {
+define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <2 x double> %a, <2 x double> %b
store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16
@@ -181,7 +181,7 @@ define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
-define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind {
+define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <4 x double> %a, <4 x double> %b
store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16
@@ -205,7 +205,7 @@ define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
-define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind {
+define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <8 x double> %a, <8 x double> %b
store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16
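
In the select-vectors.ll hunks above, the SI checks count one v_cndmask_b32 per element: vector selects are legalized element by element (sub-dword elements widened to 32 bits), with every lane reusing the same scalar condition in vcc. A small hypothetical sketch of the shape being matched, mirroring s_select_v2i32:

    %cmp = icmp eq i32 %c, 0
    %sel = select i1 %cmp, <2 x i32> %a, <2 x i32> %b   ; one v_cndmask_b32 per lane
    store <2 x i32> %sel, <2 x i32> addrspace(1)* %out
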
diff --git a/test/CodeGen/AMDGPU/select.f16.ll b/test/CodeGen/AMDGPU/select.f16.ll
index 19fe8d9b2326..2a7a9c9e0638 100644
--- a/test/CodeGen/AMDGPU/select.f16.ll
+++ b/test/CodeGen/AMDGPU/select.f16.ll
@@ -1,7 +1,7 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; GCN-LABEL: {{^}}select_f16
+; GCN-LABEL: {{^}}select_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
@@ -17,7 +17,7 @@
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @select_f16(
+define amdgpu_kernel void @select_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b,
@@ -34,13 +34,12 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}select_f16_imm_a
+; GCN-LABEL: {{^}}select_f16_imm_a:
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x3800{{$}}
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_cmp_gt_f32_e32 vcc, v[[B_F32]], v[[A_F32]]
+; SI: v_cmp_lt_f32_e32 vcc, 0.5, v[[B_F32]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
@@ -49,7 +48,7 @@ entry:
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @select_f16_imm_a(
+define amdgpu_kernel void @select_f16_imm_a(
half addrspace(1)* %r,
half addrspace(1)* %b,
half addrspace(1)* %c,
@@ -64,22 +63,22 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}select_f16_imm_b
+; GCN-LABEL: {{^}}select_f16_imm_b:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x3800{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
+; SI: v_cmp_gt_f32_e32 vcc, 0.5, v[[A_F32]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
+
; VI: v_cmp_gt_f16_e32 vcc, 0.5, v[[A_F16]]
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @select_f16_imm_b(
+define amdgpu_kernel void @select_f16_imm_b(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %c,
@@ -94,23 +93,23 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}select_f16_imm_c
+; GCN-LABEL: {{^}}select_f16_imm_c:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], 0x3800{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[C_F32]], v[[D_F32]], vcc
+; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[D_F32]], vcc
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
+
; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x3800{{$}}
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[C_F16]], v[[D_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @select_f16_imm_c(
+define amdgpu_kernel void @select_f16_imm_c(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b,
@@ -125,23 +124,22 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}select_f16_imm_d
+; GCN-LABEL: {{^}}select_f16_imm_d:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], 0x3800{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
+; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_mov_b32_e32 v[[D_F16:[0-9]+]], 0x3800{{$}}
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @select_f16_imm_d(
+define amdgpu_kernel void @select_f16_imm_d(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b,
@@ -156,21 +154,25 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}select_v2f16
-; SI: v_cvt_f32_f16_e32
-; SI: v_cvt_f32_f16_e32
-; SI: v_cvt_f32_f16_e32
-; SI: v_cvt_f32_f16_e32
-; SI: v_cmp_lt_f32_e64
-; SI: v_cmp_lt_f32_e32
-; VI: v_cmp_lt_f16_e32
-; VI: v_cmp_lt_f16_e64
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e64
-; SI: v_cvt_f16_f32_e32
-; SI: v_cvt_f16_f32_e32
+; GCN-LABEL: {{^}}select_v2f16:
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cmp_lt_f32_e64
+; SI: v_cmp_lt_f32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e64
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+
+; VI: v_cmp_lt_f16_e64
+; VI: v_cmp_lt_f16_e32
+; VI: v_cndmask_b32_e64
+; VI: v_cndmask_b32_e32
+
; GCN: s_endpgm
-define void @select_v2f16(
+define amdgpu_kernel void @select_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b,
@@ -187,25 +189,24 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}select_v2f16_imm_a
+; GCN-LABEL: {{^}}select_v2f16_imm_a:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
-; SI: v_cmp_gt_f32_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
-; SI: v_cvt_f32_f16_e32
-; SI: v_cvt_f32_f16_e32
-; SI: v_cmp_gt_f32_e64
+; SI: v_cmp_lt_f32_e64
+; SI: v_cmp_lt_f32_e32 vcc, 0.5
+
; VI: v_cmp_lt_f16_e32
; VI: v_cmp_lt_f16_e64
; GCN: v_cndmask_b32_e32
-; SI: v_cvt_f16_f32_e32
; GCN: v_cndmask_b32_e64
; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
-define void @select_v2f16_imm_a(
+define amdgpu_kernel void @select_v2f16_imm_a(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %b,
<2 x half> addrspace(1)* %c,
@@ -220,25 +221,25 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}select_v2f16_imm_b
+; GCN-LABEL: {{^}}select_v2f16_imm_b:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
-; SI: v_cmp_lt_f32_e32
-; SI: v_cvt_f32_f16_e32
-; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
-; SI: v_cmp_lt_f32_e64
+; SI: v_cmp_gt_f32_e64
+; SI: v_cmp_gt_f32_e32 vcc, 0.5
+
; VI: v_cmp_gt_f16_e32
; VI: v_cmp_gt_f16_e64
; GCN: v_cndmask_b32_e32
-; SI: v_cvt_f16_f32_e32
; GCN: v_cndmask_b32_e64
+
+; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
-define void @select_v2f16_imm_b(
+define amdgpu_kernel void @select_v2f16_imm_b(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %c,
@@ -253,9 +254,7 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}select_v2f16_imm_c
-; SI: v_cvt_f32_f16_e32
-; SI: v_cvt_f32_f16_e32
+; GCN-LABEL: {{^}}select_v2f16_imm_c:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
@@ -263,10 +262,10 @@ entry:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
-; SI: v_cmp_lt_f32_e32
-; SI: v_cmp_lt_f32_e64
-; SI: v_cndmask_b32_e32
+; SI: v_cmp_nlt_f32_e32
+; SI: v_cmp_nlt_f32_e64
; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
; VI: v_cmp_nlt_f16_e32
; VI: v_cndmask_b32_e32
@@ -277,7 +276,7 @@ entry:
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
-define void @select_v2f16_imm_c(
+define amdgpu_kernel void @select_v2f16_imm_c(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b,
@@ -292,25 +291,24 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}select_v2f16_imm_d
-; SI: v_cvt_f32_f16_e32
+; GCN-LABEL: {{^}}select_v2f16_imm_d:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
-; SI: v_cvt_f32_f16_e32
-; SI: v_cmp_lt_f32_e32
; SI: v_cmp_lt_f32_e64
+; SI: v_cmp_lt_f32_e32
+
; VI: v_cmp_lt_f16_e32
; VI: v_cmp_lt_f16_e64
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e64
+; GCN: v_cndmask_b32
+; GCN: v_cndmask_b32
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
-define void @select_v2f16_imm_d(
+define amdgpu_kernel void @select_v2f16_imm_d(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b,
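
A note on the literal that keeps appearing in the select.f16.ll hunks: 0x3800 is the IEEE half-precision encoding of 0.5 (sign 0, exponent 01110b = 14, mantissa 0, so the value is 2^(14-15) * 1.0 = 0.5). The updated SI checks therefore stop expecting a separate v_cvt_f32_f16_e32 that materializes 0x3800 into a register; the constant is instead folded as the inline operand 0.5 of the compare or v_cndmask, as in the revised check

    ; SI: v_cmp_lt_f32_e32 vcc, 0.5, v[[B_F32]]
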
diff --git a/test/CodeGen/AMDGPU/select.ll b/test/CodeGen/AMDGPU/select.ll
index 45f3cd5a7ac5..e53c159a2f71 100644
--- a/test/CodeGen/AMDGPU/select.ll
+++ b/test/CodeGen/AMDGPU/select.ll
@@ -14,7 +14,7 @@
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
-define void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out,
+define amdgpu_kernel void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out,
<2 x i32> addrspace(1)* %v2i32out, <2 x float> addrspace(1)* %v2f32out,
<4 x i32> addrspace(1)* %v4i32out, <4 x float> addrspace(1)* %v4f32out,
i32 %cond) {
diff --git a/test/CodeGen/AMDGPU/select64.ll b/test/CodeGen/AMDGPU/select64.ll
index a68fdecb00af..3b4c925a87a0 100644
--- a/test/CodeGen/AMDGPU/select64.ll
+++ b/test/CodeGen/AMDGPU/select64.ll
@@ -7,7 +7,7 @@
; CHECK-NOT: s_lshr_b64
; CHECK: v_cndmask
; CHECK: v_cndmask
-define void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) {
+define amdgpu_kernel void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) {
entry:
%0 = icmp ugt i32 %cond, 5
%1 = select i1 %0, i64 0, i64 %in
@@ -18,7 +18,7 @@ entry:
; CHECK-LABEL: {{^}}select_trunc_i64:
; CHECK: v_cndmask_b32
; CHECK-NOT: v_cndmask_b32
-define void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind {
+define amdgpu_kernel void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind {
%cmp = icmp ugt i32 %cond, 5
%sel = select i1 %cmp, i64 0, i64 %in
%trunc = trunc i64 %sel to i32
@@ -29,7 +29,7 @@ define void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwi
; CHECK-LABEL: {{^}}select_trunc_i64_2:
; CHECK: v_cndmask_b32
; CHECK-NOT: v_cndmask_b32
-define void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind {
%cmp = icmp ugt i32 %cond, 5
%sel = select i1 %cmp, i64 %a, i64 %b
%trunc = trunc i64 %sel to i32
@@ -40,7 +40,7 @@ define void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %
; CHECK-LABEL: {{^}}v_select_trunc_i64_2:
; CHECK: v_cndmask_b32
; CHECK-NOT: v_cndmask_b32
-define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
%cmp = icmp ugt i32 %cond, 5
%a = load i64, i64 addrspace(1)* %aptr, align 8
%b = load i64, i64 addrspace(1)* %bptr, align 8
@@ -54,7 +54,7 @@ define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspa
; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 63, {{v[0-9]+}}
; CHECK: s_endpgm
-define void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
%cmp = icmp ugt i32 %cond, 5
%a = load i64, i64 addrspace(1)* %aptr, align 8
%b = load i64, i64 addrspace(1)* %bptr, align 8
diff --git a/test/CodeGen/AMDGPU/selectcc-cnd.ll b/test/CodeGen/AMDGPU/selectcc-cnd.ll
index 94d0ace75697..18616851c9c2 100644
--- a/test/CodeGen/AMDGPU/selectcc-cnd.ll
+++ b/test/CodeGen/AMDGPU/selectcc-cnd.ll
@@ -3,7 +3,7 @@
;CHECK-NOT: SETE
;CHECK: CNDE {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1.0, literal.x,
;CHECK: 1073741824
-define void @test(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @test(float addrspace(1)* %out, float addrspace(1)* %in) {
%1 = load float, float addrspace(1)* %in
%2 = fcmp oeq float %1, 0.0
%3 = select i1 %2, float 1.0, float 2.0
diff --git a/test/CodeGen/AMDGPU/selectcc-cnde-int.ll b/test/CodeGen/AMDGPU/selectcc-cnde-int.ll
index 58a4ee7d62b2..1504165d3d2b 100644
--- a/test/CodeGen/AMDGPU/selectcc-cnde-int.ll
+++ b/test/CodeGen/AMDGPU/selectcc-cnde-int.ll
@@ -3,7 +3,7 @@
;CHECK-NOT: SETE_INT
;CHECK: CNDE_INT {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, literal.x,
;CHECK-NEXT: 2
-define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%1 = load i32, i32 addrspace(1)* %in
%2 = icmp eq i32 %1, 0
%3 = select i1 %2, i32 1, i32 2
diff --git a/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll b/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll
index e870ee891e66..7af5478600bb 100644
--- a/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll
+++ b/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll
@@ -6,7 +6,7 @@
; CHECK-NEXT: -1
; Test a selectcc with i32 LHS/RHS and float True/False
-define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%0 = load i32, i32 addrspace(1)* %in
%1 = icmp sge i32 %0, 0
diff --git a/test/CodeGen/AMDGPU/selectcc-opt.ll b/test/CodeGen/AMDGPU/selectcc-opt.ll
index 0f46d4c7ea06..8fef3f8b3808 100644
--- a/test/CodeGen/AMDGPU/selectcc-opt.ll
+++ b/test/CodeGen/AMDGPU/selectcc-opt.ll
@@ -7,7 +7,7 @@
; EG-NOT: CND
; EG: SET{{[NEQGTL]+}}_DX10
-define void @test_a(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @test_a(i32 addrspace(1)* %out, float %in) {
entry:
%0 = fcmp olt float %in, 0.000000e+00
%1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -35,7 +35,7 @@ ENDIF:
; EG: SET{{[GTEQN]+}}_DX10
; EG-NEXT: PRED_
; EG-NEXT: ALU clause starting
-define void @test_b(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @test_b(i32 addrspace(1)* %out, float %in) {
entry:
%0 = fcmp olt float %in, 0.0
%1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -59,7 +59,7 @@ ENDIF:
; Test a CND*_INT instruction with float true/false values
; EG-LABEL: {{^}}test_c:
; EG: CND{{[GTE]+}}_INT
-define void @test_c(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @test_c(float addrspace(1)* %out, i32 %in) {
entry:
%0 = icmp sgt i32 %in, 0
%1 = select i1 %0, float 2.0, float 3.0
@@ -72,7 +72,7 @@ entry:
; SI-NEXT: v_cndmask_b32_e64
; SI-NOT: cmp
; SI-NOT: cndmask
-define void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp ne i32 %a, %b
%ext = select i1 %icmp0, i32 -1, i32 0
store i32 %ext, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/selectcc.ll b/test/CodeGen/AMDGPU/selectcc.ll
index 446d4ab344b2..7eca22913987 100644
--- a/test/CodeGen/AMDGPU/selectcc.ll
+++ b/test/CodeGen/AMDGPU/selectcc.ll
@@ -11,7 +11,7 @@
; SI: v_cmp_eq_u64
; SI: v_cndmask
; SI: v_cndmask
-define void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) {
+define amdgpu_kernel void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) {
entry:
%0 = icmp eq i64 %lhs, %rhs
%1 = select i1 %0, i64 %true, i64 %false
diff --git a/test/CodeGen/AMDGPU/selected-stack-object.ll b/test/CodeGen/AMDGPU/selected-stack-object.ll
index 37f2747d9815..50ca59ace94e 100644
--- a/test/CodeGen/AMDGPU/selected-stack-object.ll
+++ b/test/CodeGen/AMDGPU/selected-stack-object.ll
@@ -1,4 +1,4 @@
-; "Assertion failure" should be caught with both XFAIL:* and +Asserts.
+; "Assertion failure" should be caught with both XFAIL * and +Asserts.
; XFAIL: *
; REQUIRES: asserts
diff --git a/test/CodeGen/AMDGPU/set-dx10.ll b/test/CodeGen/AMDGPU/set-dx10.ll
index 57365a6e1fc3..6867c6394937 100644
--- a/test/CodeGen/AMDGPU/set-dx10.ll
+++ b/test/CodeGen/AMDGPU/set-dx10.ll
@@ -8,7 +8,7 @@
; CHECK: LSHR
; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) {
entry:
%0 = fcmp une float %in, 5.0
%1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -22,7 +22,7 @@ entry:
; CHECK: LSHR
; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) {
entry:
%0 = fcmp une float %in, 5.0
%1 = select i1 %0, i32 -1, i32 0
@@ -34,7 +34,7 @@ entry:
; CHECK: LSHR
; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) {
entry:
%0 = fcmp oeq float %in, 5.0
%1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -48,7 +48,7 @@ entry:
; CHECK: LSHR
; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) {
entry:
%0 = fcmp oeq float %in, 5.0
%1 = select i1 %0, i32 -1, i32 0
@@ -60,7 +60,7 @@ entry:
; CHECK: LSHR
; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) {
entry:
%0 = fcmp ogt float %in, 5.0
%1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -74,7 +74,7 @@ entry:
; CHECK: LSHR
; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) {
entry:
%0 = fcmp ogt float %in, 5.0
%1 = select i1 %0, i32 -1, i32 0
@@ -86,7 +86,7 @@ entry:
; CHECK: LSHR
; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) {
entry:
%0 = fcmp oge float %in, 5.0
%1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -100,7 +100,7 @@ entry:
; CHECK: LSHR
; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) {
entry:
%0 = fcmp oge float %in, 5.0
%1 = select i1 %0, i32 -1, i32 0
@@ -112,7 +112,7 @@ entry:
; CHECK: LSHR
; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) {
entry:
%0 = fcmp ole float %in, 5.0
%1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -126,7 +126,7 @@ entry:
; CHECK: LSHR
; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) {
entry:
%0 = fcmp ole float %in, 5.0
%1 = select i1 %0, i32 -1, i32 0
@@ -138,7 +138,7 @@ entry:
; CHECK: LSHR
; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) {
entry:
%0 = fcmp olt float %in, 5.0
%1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -152,7 +152,7 @@ entry:
; CHECK: LSHR
; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) {
entry:
%0 = fcmp olt float %in, 5.0
%1 = select i1 %0, i32 -1, i32 0
diff --git a/test/CodeGen/AMDGPU/setcc-equivalent.ll b/test/CodeGen/AMDGPU/setcc-equivalent.ll
index 11ea793650c4..853afa8772ea 100644
--- a/test/CodeGen/AMDGPU/setcc-equivalent.ll
+++ b/test/CodeGen/AMDGPU/setcc-equivalent.ll
@@ -3,7 +3,7 @@
; EG-LABEL: {{^}}and_setcc_setcc_i32:
; EG: AND_INT
; EG-NEXT: SETE_INT
-define void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
%cmp1 = icmp eq i32 %a, -1
%cmp2 = icmp eq i32 %b, -1
%and = and i1 %cmp1, %cmp2
@@ -20,7 +20,7 @@ define void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
; EG: SETE_INT
; EG: AND_INT
; EG: SETE_INT
-define void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
+define amdgpu_kernel void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
%cmp1 = icmp eq <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
%cmp2 = icmp eq <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
%and = and <4 x i1> %cmp1, %cmp2
diff --git a/test/CodeGen/AMDGPU/setcc-fneg-constant.ll b/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
new file mode 100644
index 000000000000..8d455d84bf9e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
@@ -0,0 +1,258 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+
+; Test fcmp pred (fneg x), c -> fcmp (swapped pred) x, -c combine.
+
+; GCN-LABEL: {{^}}multi_use_fneg_src:
+; GCN: buffer_load_dword [[A:v[0-9]+]]
+; GCN: buffer_load_dword [[B:v[0-9]+]]
+; GCN: buffer_load_dword [[C:v[0-9]+]]
+
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL]]
+; GCN: buffer_store_dword [[MUL]]
+define amdgpu_kernel void @multi_use_fneg_src() #0 {
+ %a = load volatile float, float addrspace(1)* undef
+ %b = load volatile float, float addrspace(1)* undef
+ %x = load volatile i32, i32 addrspace(1)* undef
+ %y = load volatile i32, i32 addrspace(1)* undef
+
+ %mul = fmul float %a, %b
+ %neg.mul = fsub float -0.0, %mul
+ %cmp = fcmp oeq float %neg.mul, 4.0
+ %select = select i1 %cmp, i32 %x, i32 %y
+ store volatile i32 %select, i32 addrspace(1)* undef
+ store volatile float %mul, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}multi_foldable_use_fneg_src:
+; GCN: buffer_load_dword [[A:v[0-9]+]]
+; GCN: buffer_load_dword [[B:v[0-9]+]]
+; GCN: buffer_load_dword [[C:v[0-9]+]]
+
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[A]]
+; GCN: v_mul_f32_e64 [[USE1:v[0-9]+]], [[MUL]], -[[MUL]]
+define amdgpu_kernel void @multi_foldable_use_fneg_src() #0 {
+ %a = load volatile float, float addrspace(1)* undef
+ %b = load volatile float, float addrspace(1)* undef
+ %x = load volatile i32, i32 addrspace(1)* undef
+ %y = load volatile i32, i32 addrspace(1)* undef
+
+ %mul = fmul float %a, %b
+ %neg.mul = fsub float -0.0, %mul
+ %use1 = fmul float %mul, %neg.mul
+ %cmp = fcmp oeq float %neg.mul, 4.0
+ %select = select i1 %cmp, i32 %x, i32 %y
+
+ store volatile i32 %select, i32 addrspace(1)* undef
+ store volatile float %use1, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}multi_use_fneg:
+; GCN: buffer_load_dword [[A:v[0-9]+]]
+; GCN: buffer_load_dword [[B:v[0-9]+]]
+; GCN: buffer_load_dword [[C:v[0-9]+]]
+
+; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
+; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 4.0, [[MUL]]
+; GCN-NOT: xor
+; GCN: buffer_store_dword [[MUL]]
+define amdgpu_kernel void @multi_use_fneg() #0 {
+ %a = load volatile float, float addrspace(1)* undef
+ %b = load volatile float, float addrspace(1)* undef
+ %x = load volatile i32, i32 addrspace(1)* undef
+ %y = load volatile i32, i32 addrspace(1)* undef
+
+ %mul = fmul float %a, %b
+ %neg.mul = fsub float -0.0, %mul
+ %cmp = fcmp oeq float %neg.mul, 4.0
+ %select = select i1 %cmp, i32 %x, i32 %y
+ store volatile i32 %select, i32 addrspace(1)* undef
+ store volatile float %neg.mul, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}multi_foldable_use_fneg:
+; GCN: buffer_load_dword [[A:v[0-9]+]]
+; GCN: buffer_load_dword [[B:v[0-9]+]]
+
+; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[B]], [[A]]
+; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL0]]
+; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[MUL0]], [[MUL0]]
+; GCN: buffer_store_dword [[MUL1]]
+define amdgpu_kernel void @multi_foldable_use_fneg() #0 {
+ %a = load volatile float, float addrspace(1)* undef
+ %b = load volatile float, float addrspace(1)* undef
+ %x = load volatile i32, i32 addrspace(1)* undef
+ %y = load volatile i32, i32 addrspace(1)* undef
+ %z = load volatile i32, i32 addrspace(1)* undef
+
+ %mul = fmul float %a, %b
+ %neg.mul = fsub float -0.0, %mul
+ %cmp = fcmp oeq float %neg.mul, 4.0
+ %select = select i1 %cmp, i32 %x, i32 %y
+ %use1 = fmul float %neg.mul, %mul
+ store volatile i32 %select, i32 addrspace(1)* undef
+ store volatile float %use1, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_setcc_fneg_oeq_posk_f32:
+; GCN: v_cmp_eq_f32_e32 vcc, -4.0, v{{[0-9]+}}
+define amdgpu_kernel void @test_setcc_fneg_oeq_posk_f32() #0 {
+ %a = load volatile float, float addrspace(1)* undef
+ %x = load volatile i32, i32 addrspace(1)* undef
+ %y = load volatile i32, i32 addrspace(1)* undef
+ %neg.a = fsub float -0.0, %a
+ %cmp = fcmp oeq float %neg.a, 4.0
+ %select = select i1 %cmp, i32 %x, i32 %y
+ store volatile i32 %select, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_setcc_fneg_ogt_posk_f32:
+; GCN: v_cmp_gt_f32_e32 vcc, -4.0, v{{[0-9]+}}
+define amdgpu_kernel void @test_setcc_fneg_ogt_posk_f32() #0 {
+ %a = load volatile float, float addrspace(1)* undef
+ %x = load volatile i32, i32 addrspace(1)* undef
+ %y = load volatile i32, i32 addrspace(1)* undef
+ %neg.a = fsub float -0.0, %a
+ %cmp = fcmp ogt float %neg.a, 4.0
+ %select = select i1 %cmp, i32 %x, i32 %y
+ store volatile i32 %select, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_setcc_fneg_oge_posk_f32:
+; GCN: v_cmp_ge_f32_e32 vcc, -4.0, v{{[0-9]+}}
+define amdgpu_kernel void @test_setcc_fneg_oge_posk_f32() #0 {
+ %a = load volatile float, float addrspace(1)* undef
+ %x = load volatile i32, i32 addrspace(1)* undef
+ %y = load volatile i32, i32 addrspace(1)* undef
+ %neg.a = fsub float -0.0, %a
+ %cmp = fcmp oge float %neg.a, 4.0
+ %select = select i1 %cmp, i32 %x, i32 %y
+ store volatile i32 %select, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_setcc_fneg_olt_posk_f32:
+; GCN: v_cmp_lt_f32_e32 vcc, -4.0, v{{[0-9]+}}
+define amdgpu_kernel void @test_setcc_fneg_olt_posk_f32() #0 {
+ %a = load volatile float, float addrspace(1)* undef
+ %x = load volatile i32, i32 addrspace(1)* undef
+ %y = load volatile i32, i32 addrspace(1)* undef
+ %neg.a = fsub float -0.0, %a
+ %cmp = fcmp olt float %neg.a, 4.0
+ %select = select i1 %cmp, i32 %x, i32 %y
+ store volatile i32 %select, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_setcc_fneg_ole_posk_f32:
+; GCN: v_cmp_le_f32_e32 vcc, -4.0, v{{[0-9]+}}
+define amdgpu_kernel void @test_setcc_fneg_ole_posk_f32() #0 {
+ %a = load volatile float, float addrspace(1)* undef
+ %x = load volatile i32, i32 addrspace(1)* undef
+ %y = load volatile i32, i32 addrspace(1)* undef
+ %neg.a = fsub float -0.0, %a
+ %cmp = fcmp ole float %neg.a, 4.0
+ %select = select i1 %cmp, i32 %x, i32 %y
+ store volatile i32 %select, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_setcc_fneg_one_posk_f32:
+; GCN: v_cmp_lg_f32_e32 vcc, -4.0, v{{[0-9]+}}
+define amdgpu_kernel void @test_setcc_fneg_one_posk_f32() #0 {
+ %a = load volatile float, float addrspace(1)* undef
+ %x = load volatile i32, i32 addrspace(1)* undef
+ %y = load volatile i32, i32 addrspace(1)* undef
+ %neg.a = fsub float -0.0, %a
+ %cmp = fcmp one float %neg.a, 4.0
+ %select = select i1 %cmp, i32 %x, i32 %y
+ store volatile i32 %select, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_setcc_fneg_ueq_posk_f32:
+; GCN: v_cmp_nlg_f32_e32 vcc, -4.0, v{{[0-9]+}}
+define amdgpu_kernel void @test_setcc_fneg_ueq_posk_f32() #0 {
+ %a = load volatile float, float addrspace(1)* undef
+ %x = load volatile i32, i32 addrspace(1)* undef
+ %y = load volatile i32, i32 addrspace(1)* undef
+ %neg.a = fsub float -0.0, %a
+ %cmp = fcmp ueq float %neg.a, 4.0
+ %select = select i1 %cmp, i32 %x, i32 %y
+ store volatile i32 %select, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_setcc_fneg_ugt_posk_f32:
+; GCN: v_cmp_nle_f32_e32 vcc, -4.0, v{{[0-9]+}}
+define amdgpu_kernel void @test_setcc_fneg_ugt_posk_f32() #0 {
+ %a = load volatile float, float addrspace(1)* undef
+ %x = load volatile i32, i32 addrspace(1)* undef
+ %y = load volatile i32, i32 addrspace(1)* undef
+ %neg.a = fsub float -0.0, %a
+ %cmp = fcmp ugt float %neg.a, 4.0
+ %select = select i1 %cmp, i32 %x, i32 %y
+ store volatile i32 %select, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_setcc_fneg_uge_posk_f32:
+; GCN: v_cmp_nlt_f32_e32 vcc, -4.0, v{{[0-9]+}}
+define amdgpu_kernel void @test_setcc_fneg_uge_posk_f32() #0 {
+ %a = load volatile float, float addrspace(1)* undef
+ %x = load volatile i32, i32 addrspace(1)* undef
+ %y = load volatile i32, i32 addrspace(1)* undef
+ %neg.a = fsub float -0.0, %a
+ %cmp = fcmp uge float %neg.a, 4.0
+ %select = select i1 %cmp, i32 %x, i32 %y
+ store volatile i32 %select, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_setcc_fneg_ult_posk_f32:
+; GCN: v_cmp_nge_f32_e32 vcc, -4.0, v{{[0-9]+}}
+define amdgpu_kernel void @test_setcc_fneg_ult_posk_f32() #0 {
+ %a = load volatile float, float addrspace(1)* undef
+ %x = load volatile i32, i32 addrspace(1)* undef
+ %y = load volatile i32, i32 addrspace(1)* undef
+ %neg.a = fsub float -0.0, %a
+ %cmp = fcmp ult float %neg.a, 4.0
+ %select = select i1 %cmp, i32 %x, i32 %y
+ store volatile i32 %select, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_setcc_fneg_ule_posk_f32:
+; GCN: v_cmp_ngt_f32_e32 vcc, -4.0, v{{[0-9]+}}
+define amdgpu_kernel void @test_setcc_fneg_ule_posk_f32() #0 {
+ %a = load volatile float, float addrspace(1)* undef
+ %x = load volatile i32, i32 addrspace(1)* undef
+ %y = load volatile i32, i32 addrspace(1)* undef
+ %neg.a = fsub float -0.0, %a
+ %cmp = fcmp ule float %neg.a, 4.0
+ %select = select i1 %cmp, i32 %x, i32 %y
+ store volatile i32 %select, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_setcc_fneg_une_posk_f32:
+; GCN: v_cmp_neq_f32_e32 vcc, -4.0, v{{[0-9]+}}
+define amdgpu_kernel void @test_setcc_fneg_une_posk_f32() #0 {
+ %a = load volatile float, float addrspace(1)* undef
+ %x = load volatile i32, i32 addrspace(1)* undef
+ %y = load volatile i32, i32 addrspace(1)* undef
+ %neg.a = fsub float -0.0, %a
+ %cmp = fcmp une float %neg.a, 4.0
+ %select = select i1 %cmp, i32 %x, i32 %y
+ store volatile i32 %select, i32 addrspace(1)* undef
+ ret void
+}
+
+attributes #0 = { nounwind }
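
The new setcc-fneg-constant.ll file exercises the combine named in its header comment: a compare of a negated value against a constant becomes a compare of the original value against the negated constant, with the ordered and unordered inequality predicates swapping direction. A minimal IR sketch of one case (hypothetical values, following the olt test above):

    %neg.x = fsub float -0.0, %x          ; fneg of %x
    %cmp   = fcmp olt float %neg.x, 4.0   ; -x < 4.0
    ; combines to the equivalent x > -4.0, i.e. fcmp ogt float %x, -4.0,
    ; which is what the v_cmp_lt_f32_e32 vcc, -4.0, v... check matches
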
diff --git a/test/CodeGen/AMDGPU/setcc-opt.ll b/test/CodeGen/AMDGPU/setcc-opt.ll
index 4ab6da085634..caddb6f68218 100644
--- a/test/CodeGen/AMDGPU/setcc-opt.ll
+++ b/test/CodeGen/AMDGPU/setcc-opt.ll
@@ -11,7 +11,7 @@
; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W
; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1
-define void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp eq i32 %a, %b
%ext = sext i1 %icmp0 to i32
%icmp1 = icmp eq i32 %ext, 0
@@ -28,7 +28,7 @@ define void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W
; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1
-define void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp ne i32 %a, %b
%ext = sext i1 %icmp0 to i32
%icmp1 = icmp ne i32 %ext, 0
@@ -42,7 +42,7 @@ define void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN-NEXT: buffer_store_byte [[RESULT]]
; GCN-NEXT: s_endpgm
-define void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp eq i32 %a, %b
%ext = sext i1 %icmp0 to i32
%icmp1 = icmp eq i32 %ext, -1
@@ -56,7 +56,7 @@ define void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounw
; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN-NEXT: buffer_store_byte [[RESULT]]
; GCN-NEXT: s_endpgm
-define void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp ne i32 %a, %b
%ext = sext i1 %icmp0 to i32
%icmp1 = icmp ne i32 %ext, -1
@@ -70,7 +70,7 @@ define void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounw
; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN-NEXT: buffer_store_byte [[RESULT]]
; GCN-NEXT: s_endpgm
-define void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp eq i32 %a, %b
%ext = zext i1 %icmp0 to i32
%icmp1 = icmp eq i32 %ext, 0
@@ -84,7 +84,7 @@ define void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN-NEXT: buffer_store_byte [[RESULT]]
; GCN-NEXT: s_endpgm
-define void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp ne i32 %a, %b
%ext = zext i1 %icmp0 to i32
%icmp1 = icmp ne i32 %ext, 0
@@ -98,7 +98,7 @@ define void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN-NEXT: buffer_store_byte [[RESULT]]
; GCN-NEXT: s_endpgm
-define void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp eq i32 %a, %b
%ext = zext i1 %icmp0 to i32
%icmp1 = icmp eq i32 %ext, 1
@@ -111,7 +111,7 @@ define void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
; GCN: v_cmp_eq_u32_e32 vcc,
; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN-NEXT: buffer_store_byte [[RESULT]]
-define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp ne i32 %a, %b
%ext = zext i1 %icmp0 to i32
%icmp1 = icmp ne i32 %ext, 1
@@ -124,7 +124,7 @@ define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0{{$}}
; GCN: buffer_store_byte [[TMP]]
; GCN-NEXT: s_endpgm
-define void @zext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp eq i32 %a, %b
%ext = zext i1 %icmp0 to i32
%icmp1 = icmp eq i32 %ext, -1
@@ -137,7 +137,7 @@ define void @zext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounw
; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 1{{$}}
; GCN: buffer_store_byte [[TMP]]
; GCN-NEXT: s_endpgm
-define void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp ne i32 %a, %b
%ext = zext i1 %icmp0 to i32
%icmp1 = icmp ne i32 %ext, -1
@@ -159,7 +159,7 @@ define void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounw
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN: buffer_store_byte [[RESULT]]
; GCN: s_endpgm
-define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {
+define amdgpu_kernel void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {
%b.ext = zext i8 %b to i32
%icmp0 = icmp ne i32 %b.ext, 255
store i1 %icmp0, i1 addrspace(1)* %out
@@ -172,7 +172,7 @@ define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {
; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN: buffer_store_byte [[RESULT]]
; GCN: s_endpgm
-define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind {
+define amdgpu_kernel void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind {
%b = load i8, i8 addrspace(1)* %b.ptr
%b.ext = sext i8 %b to i32
%icmp0 = icmp ne i32 %b.ext, -1
@@ -186,7 +186,7 @@ define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nou
; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]]
; GCN-NEXT: buffer_store_byte [[RESULT]]
; GCN: s_endpgm
-define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind {
+define amdgpu_kernel void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind {
%b.ext = sext i8 %b to i32
%icmp0 = icmp ne i32 %b.ext, -1
store i1 %icmp0, i1 addrspace(1)* %out
@@ -207,7 +207,7 @@ define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) n
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN: buffer_store_byte [[RESULT]]
; GCN: s_endpgm
-define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind {
+define amdgpu_kernel void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind {
%b.ext = sext i8 %b to i32
%icmp0 = icmp ne i32 %b.ext, -1
store i1 %icmp0, i1 addrspace(1)* %out
@@ -218,7 +218,7 @@ define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind {
; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
; GCN: buffer_store_byte [[RESULT]]
; GCN: s_endpgm
-define void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind {
+define amdgpu_kernel void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind {
%b.ext = zext i8 %b to i32
%icmp0 = icmp ne i32 %b.ext, -1
store i1 %icmp0, i1 addrspace(1)* %out
@@ -229,7 +229,7 @@ define void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind {
; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
; GCN: buffer_store_byte [[RESULT]]
; GCN-NEXT: s_endpgm
-define void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp ne i32 %a, %b
%ext = zext i1 %icmp0 to i32
%icmp1 = icmp ne i32 %ext, 2
@@ -241,7 +241,7 @@ define void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; GCN: buffer_store_byte [[RESULT]]
; GCN-NEXT: s_endpgm
-define void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp ne i32 %a, %b
%ext = zext i1 %icmp0 to i32
%icmp1 = icmp eq i32 %ext, 2
@@ -256,7 +256,7 @@ define void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
; FUNC-LABEL: {{^}}sext_bool_icmp_eq_1:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0{{$}}
; GCN: buffer_store_byte [[K]]
-define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp eq i32 %a, %b
%ext = sext i1 %icmp0 to i32
%icmp1 = icmp eq i32 %ext, 1
@@ -267,7 +267,7 @@ define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
; FUNC-LABEL: {{^}}sext_bool_icmp_ne_1:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 1{{$}}
; GCN: buffer_store_byte [[K]]
-define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp ne i32 %a, %b
%ext = sext i1 %icmp0 to i32
%icmp1 = icmp ne i32 %ext, 1
@@ -278,7 +278,7 @@ define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
; FUNC-LABEL: {{^}}sext_bool_icmp_ne_k:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 1{{$}}
; GCN: buffer_store_byte [[K]]
-define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp ne i32 %a, %b
%ext = sext i1 %icmp0 to i32
%icmp1 = icmp ne i32 %ext, 2
diff --git a/test/CodeGen/AMDGPU/setcc.ll b/test/CodeGen/AMDGPU/setcc.ll
index 10d04bab9f6b..add90e9c2f3a 100644
--- a/test/CodeGen/AMDGPU/setcc.ll
+++ b/test/CodeGen/AMDGPU/setcc.ll
@@ -9,7 +9,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; GCN-DAG: v_cmp_eq_u32_e32
; GCN-DAG: v_cmp_eq_u32_e64
-define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
+define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
%result = icmp eq <2 x i32> %a, %b
%sext = sext <2 x i1> %result to <2 x i32>
store <2 x i32> %sext, <2 x i32> addrspace(1)* %out
@@ -26,7 +26,7 @@ define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %
; GCN: v_cmp_eq_u32_e64
; GCN: v_cmp_eq_u32_e64
; GCN: v_cmp_eq_u32_e64
-define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32>, <4 x i32> addrspace(1)* %in
%b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
@@ -43,7 +43,7 @@ define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %
; FUNC-LABEL: {{^}}f32_oeq:
; R600: SETE_DX10
; GCN: v_cmp_eq_f32
-define void @f32_oeq(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_oeq(i32 addrspace(1)* %out, float %a, float %b) #0 {
entry:
%0 = fcmp oeq float %a, %b
%1 = sext i1 %0 to i32
@@ -54,7 +54,7 @@ entry:
; FUNC-LABEL: {{^}}f32_ogt:
; R600: SETGT_DX10
; GCN: v_cmp_gt_f32
-define void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) #0 {
entry:
%0 = fcmp ogt float %a, %b
%1 = sext i1 %0 to i32
@@ -65,7 +65,7 @@ entry:
; FUNC-LABEL: {{^}}f32_oge:
; R600: SETGE_DX10
; GCN: v_cmp_ge_f32
-define void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) #0 {
entry:
%0 = fcmp oge float %a, %b
%1 = sext i1 %0 to i32
@@ -76,7 +76,7 @@ entry:
; FUNC-LABEL: {{^}}f32_olt:
; R600: SETGT_DX10
; GCN: v_cmp_lt_f32
-define void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) #0 {
entry:
%0 = fcmp olt float %a, %b
%1 = sext i1 %0 to i32
@@ -87,7 +87,7 @@ entry:
; FUNC-LABEL: {{^}}f32_ole:
; R600: SETGE_DX10
; GCN: v_cmp_le_f32
-define void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) #0 {
entry:
%0 = fcmp ole float %a, %b
%1 = sext i1 %0 to i32
@@ -105,7 +105,7 @@ entry:
; GCN: v_cmp_lg_f32_e32 vcc
; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_one(i32 addrspace(1)* %out, float %a, float %b) #0 {
entry:
%0 = fcmp one float %a, %b
%1 = sext i1 %0 to i32
@@ -119,7 +119,7 @@ entry:
; R600-DAG: AND_INT
; R600-DAG: SETNE_INT
; GCN: v_cmp_o_f32
-define void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) #0 {
entry:
%0 = fcmp ord float %a, %b
%1 = sext i1 %0 to i32
@@ -137,7 +137,7 @@ entry:
; GCN: v_cmp_nlg_f32_e32 vcc
; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) #0 {
entry:
%0 = fcmp ueq float %a, %b
%1 = sext i1 %0 to i32
@@ -150,7 +150,7 @@ entry:
; R600: SETE_DX10
; GCN: v_cmp_nle_f32_e32 vcc
; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) #0 {
entry:
%0 = fcmp ugt float %a, %b
%1 = sext i1 %0 to i32
@@ -164,7 +164,7 @@ entry:
; GCN: v_cmp_nlt_f32_e32 vcc
; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) #0 {
entry:
%0 = fcmp uge float %a, %b
%1 = sext i1 %0 to i32
@@ -178,7 +178,7 @@ entry:
; GCN: v_cmp_nge_f32_e32 vcc
; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) #0 {
entry:
%0 = fcmp ult float %a, %b
%1 = sext i1 %0 to i32
@@ -192,7 +192,7 @@ entry:
; GCN: v_cmp_ngt_f32_e32 vcc
; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) #0 {
entry:
%0 = fcmp ule float %a, %b
%1 = sext i1 %0 to i32
@@ -203,7 +203,7 @@ entry:
; FUNC-LABEL: {{^}}f32_une:
; R600: SETNE_DX10
; GCN: v_cmp_neq_f32
-define void @f32_une(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_une(i32 addrspace(1)* %out, float %a, float %b) #0 {
entry:
%0 = fcmp une float %a, %b
%1 = sext i1 %0 to i32
@@ -217,7 +217,7 @@ entry:
; R600: OR_INT
; R600: SETNE_INT
; GCN: v_cmp_u_f32
-define void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) #0 {
entry:
%0 = fcmp uno float %a, %b
%1 = sext i1 %0 to i32
@@ -232,7 +232,7 @@ entry:
; FUNC-LABEL: {{^}}i32_eq:
; R600: SETE_INT
; GCN: v_cmp_eq_u32
-define void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%0 = icmp eq i32 %a, %b
%1 = sext i1 %0 to i32
@@ -243,7 +243,7 @@ entry:
; FUNC-LABEL: {{^}}i32_ne:
; R600: SETNE_INT
; GCN: v_cmp_ne_u32
-define void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%0 = icmp ne i32 %a, %b
%1 = sext i1 %0 to i32
@@ -254,7 +254,7 @@ entry:
; FUNC-LABEL: {{^}}i32_ugt:
; R600: SETGT_UINT
; GCN: v_cmp_gt_u32
-define void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%0 = icmp ugt i32 %a, %b
%1 = sext i1 %0 to i32
@@ -265,7 +265,7 @@ entry:
; FUNC-LABEL: {{^}}i32_uge:
; R600: SETGE_UINT
; GCN: v_cmp_ge_u32
-define void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%0 = icmp uge i32 %a, %b
%1 = sext i1 %0 to i32
@@ -276,7 +276,7 @@ entry:
; FUNC-LABEL: {{^}}i32_ult:
; R600: SETGT_UINT
; GCN: v_cmp_lt_u32
-define void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%0 = icmp ult i32 %a, %b
%1 = sext i1 %0 to i32
@@ -287,7 +287,7 @@ entry:
; FUNC-LABEL: {{^}}i32_ule:
; R600: SETGE_UINT
; GCN: v_cmp_le_u32
-define void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%0 = icmp ule i32 %a, %b
%1 = sext i1 %0 to i32
@@ -298,7 +298,7 @@ entry:
; FUNC-LABEL: {{^}}i32_sgt:
; R600: SETGT_INT
; GCN: v_cmp_gt_i32
-define void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%0 = icmp sgt i32 %a, %b
%1 = sext i1 %0 to i32
@@ -309,7 +309,7 @@ entry:
; FUNC-LABEL: {{^}}i32_sge:
; R600: SETGE_INT
; GCN: v_cmp_ge_i32
-define void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%0 = icmp sge i32 %a, %b
%1 = sext i1 %0 to i32
@@ -320,7 +320,7 @@ entry:
; FUNC-LABEL: {{^}}i32_slt:
; R600: SETGT_INT
; GCN: v_cmp_lt_i32
-define void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%0 = icmp slt i32 %a, %b
%1 = sext i1 %0 to i32
@@ -331,7 +331,7 @@ entry:
; FUNC-LABEL: {{^}}i32_sle:
; R600: SETGE_INT
; GCN: v_cmp_le_i32
-define void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%0 = icmp sle i32 %a, %b
%1 = sext i1 %0 to i32
@@ -348,7 +348,7 @@ entry:
; GCN-DAG: v_cmp_eq_u32
; GCN-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
; GCN: s_endpgm
-define void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) #0 {
+define amdgpu_kernel void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
%gep.a = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptra, i32 %tid
%gep.b = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptrb, i32 %tid
@@ -369,7 +369,7 @@ define void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptr
; GCN-DAG: v_cmp_eq_u32
; GCN-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
; GCN: s_endpgm
-define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) #0 {
+define amdgpu_kernel void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
%gep.a = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptra, i32 %tid
%gep.b = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptrb, i32 %tid
@@ -386,7 +386,7 @@ define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra,
; FUNC-LABEL: setcc-i1
; GCN: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 1
; GCN: s_cmp_eq_u32 [[AND]], 0
-define void @setcc-i1(i32 %in) #0 {
+define amdgpu_kernel void @setcc-i1(i32 %in) #0 {
%and = and i32 %in, 1
%cmp = icmp eq i32 %and, 0
br i1 %cmp, label %endif, label %if
@@ -400,7 +400,7 @@ endif:
; GCN-DAG: v_cmp_ge_f32_e64 [[A:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_cmp_le_f32_e64 [[B:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
; GCN: s_and_b64 s[2:3], [[A]], [[B]]
-define void @setcc-i1-and-xor(i32 addrspace(1)* %out, float %cond) #0 {
+define amdgpu_kernel void @setcc-i1-and-xor(i32 addrspace(1)* %out, float %cond) #0 {
bb0:
%tmp5 = fcmp oge float %cond, 0.000000e+00
%tmp7 = fcmp ole float %cond, 1.000000e+00
diff --git a/test/CodeGen/AMDGPU/setcc64.ll b/test/CodeGen/AMDGPU/setcc64.ll
index 1f86277e0bc6..1f1bdb055302 100644
--- a/test/CodeGen/AMDGPU/setcc64.ll
+++ b/test/CodeGen/AMDGPU/setcc64.ll
@@ -9,7 +9,7 @@
; GCN-LABEL: {{^}}f64_oeq:
; GCN: v_cmp_eq_f64
-define void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) #0 {
entry:
%tmp0 = fcmp oeq double %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -19,7 +19,7 @@ entry:
; GCN-LABEL: {{^}}f64_ogt:
; GCN: v_cmp_gt_f64
-define void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) #0 {
entry:
%tmp0 = fcmp ogt double %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -29,7 +29,7 @@ entry:
; GCN-LABEL: {{^}}f64_oge:
; GCN: v_cmp_ge_f64
-define void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) #0 {
entry:
%tmp0 = fcmp oge double %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -39,7 +39,7 @@ entry:
; GCN-LABEL: {{^}}f64_olt:
; GCN: v_cmp_lt_f64
-define void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) #0 {
entry:
%tmp0 = fcmp olt double %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -49,7 +49,7 @@ entry:
; GCN-LABEL: {{^}}f64_ole:
; GCN: v_cmp_le_f64
-define void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) #0 {
entry:
%tmp0 = fcmp ole double %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -60,7 +60,7 @@ entry:
; GCN-LABEL: {{^}}f64_one:
; GCN: v_cmp_lg_f64_e32 vcc
; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_one(i32 addrspace(1)* %out, double %a, double %b) #0 {
entry:
%tmp0 = fcmp one double %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -70,7 +70,7 @@ entry:
; GCN-LABEL: {{^}}f64_ord:
; GCN: v_cmp_o_f64
-define void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) #0 {
entry:
%tmp0 = fcmp ord double %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -81,7 +81,7 @@ entry:
; GCN-LABEL: {{^}}f64_ueq:
; GCN: v_cmp_nlg_f64_e32 vcc
; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) #0 {
entry:
%tmp0 = fcmp ueq double %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -93,7 +93,7 @@ entry:
; GCN: v_cmp_nle_f64_e32 vcc
; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) #0 {
entry:
%tmp0 = fcmp ugt double %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -104,7 +104,7 @@ entry:
; GCN-LABEL: {{^}}f64_uge:
; GCN: v_cmp_nlt_f64_e32 vcc
; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) #0 {
entry:
%tmp0 = fcmp uge double %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -115,7 +115,7 @@ entry:
; GCN-LABEL: {{^}}f64_ult:
; GCN: v_cmp_nge_f64_e32 vcc
; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) #0 {
entry:
%tmp0 = fcmp ult double %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -126,7 +126,7 @@ entry:
; GCN-LABEL: {{^}}f64_ule:
; GCN: v_cmp_ngt_f64_e32 vcc
; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) #0 {
entry:
%tmp0 = fcmp ule double %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -136,7 +136,7 @@ entry:
; GCN-LABEL: {{^}}f64_une:
; GCN: v_cmp_neq_f64
-define void @f64_une(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_une(i32 addrspace(1)* %out, double %a, double %b) #0 {
entry:
%tmp0 = fcmp une double %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -146,7 +146,7 @@ entry:
; GCN-LABEL: {{^}}f64_uno:
; GCN: v_cmp_u_f64
-define void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) #0 {
entry:
%tmp0 = fcmp uno double %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -160,7 +160,7 @@ entry:
; GCN-LABEL: {{^}}i64_eq:
; GCN: v_cmp_eq_u64
-define void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
entry:
%tmp0 = icmp eq i64 %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -170,7 +170,7 @@ entry:
; GCN-LABEL: {{^}}i64_ne:
; GCN: v_cmp_ne_u64
-define void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
entry:
%tmp0 = icmp ne i64 %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -180,7 +180,7 @@ entry:
; GCN-LABEL: {{^}}i64_ugt:
; GCN: v_cmp_gt_u64
-define void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
entry:
%tmp0 = icmp ugt i64 %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -190,7 +190,7 @@ entry:
; GCN-LABEL: {{^}}i64_uge:
; GCN: v_cmp_ge_u64
-define void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
entry:
%tmp0 = icmp uge i64 %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -200,7 +200,7 @@ entry:
; GCN-LABEL: {{^}}i64_ult:
; GCN: v_cmp_lt_u64
-define void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
entry:
%tmp0 = icmp ult i64 %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -210,7 +210,7 @@ entry:
; GCN-LABEL: {{^}}i64_ule:
; GCN: v_cmp_le_u64
-define void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
entry:
%tmp0 = icmp ule i64 %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -220,7 +220,7 @@ entry:
; GCN-LABEL: {{^}}i64_sgt:
; GCN: v_cmp_gt_i64
-define void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
entry:
%tmp0 = icmp sgt i64 %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -230,7 +230,7 @@ entry:
; GCN-LABEL: {{^}}i64_sge:
; GCN: v_cmp_ge_i64
-define void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
entry:
%tmp0 = icmp sge i64 %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -240,7 +240,7 @@ entry:
; GCN-LABEL: {{^}}i64_slt:
; GCN: v_cmp_lt_i64
-define void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
entry:
%tmp0 = icmp slt i64 %a, %b
%tmp1 = sext i1 %tmp0 to i32
@@ -250,7 +250,7 @@ entry:
; GCN-LABEL: {{^}}i64_sle:
; GCN: v_cmp_le_i64
-define void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
entry:
%tmp0 = icmp sle i64 %a, %b
%tmp1 = sext i1 %tmp0 to i32
diff --git a/test/CodeGen/AMDGPU/seto.ll b/test/CodeGen/AMDGPU/seto.ll
index 01e4a7fda5d2..b4385aa0ccca 100644
--- a/test/CodeGen/AMDGPU/seto.ll
+++ b/test/CodeGen/AMDGPU/seto.ll
@@ -4,12 +4,9 @@
; CHECK-LABEL: {{^}}main:
; CHECK: v_cmp_o_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]]
; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]]
-define void @main(float %p) {
+define amdgpu_ps float @main(float inreg %p) {
main_body:
%c = fcmp oeq float %p, %p
%r = select i1 %c, float 1.000000e+00, float 0.000000e+00
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r)
- ret void
+ ret float %r
}
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
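
Here, and in setuo.ll below, the test is migrated off the legacy llvm.SI.export intrinsic: the shader becomes an amdgpu_ps entry point that returns its result directly, and %p is marked inreg so it is still passed in an SGPR and the [[SREG]] check lines keep matching. A sketch of the migrated shape (hypothetical name):

define amdgpu_ps float @ps_sketch(float inreg %p) {
main_body:
  %c = fcmp oeq float %p, %p
  %r = select i1 %c, float 1.000000e+00, float 0.000000e+00
  ret float %r
}
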
diff --git a/test/CodeGen/AMDGPU/setuo.ll b/test/CodeGen/AMDGPU/setuo.ll
index 76346c4f624a..f6821b675e22 100644
--- a/test/CodeGen/AMDGPU/setuo.ll
+++ b/test/CodeGen/AMDGPU/setuo.ll
@@ -4,12 +4,9 @@
; CHECK-LABEL: {{^}}main:
; CHECK: v_cmp_u_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]]
; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]]
-define void @main(float %p) {
+define amdgpu_ps float @main(float inreg %p) {
main_body:
%c = fcmp une float %p, %p
%r = select i1 %c, float 1.000000e+00, float 0.000000e+00
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r)
- ret void
+ ret float %r
}
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/sext-eliminate.ll b/test/CodeGen/AMDGPU/sext-eliminate.ll
index 7dc6eb87f6b5..0b780af17bca 100644
--- a/test/CodeGen/AMDGPU/sext-eliminate.ll
+++ b/test/CodeGen/AMDGPU/sext-eliminate.ll
@@ -6,7 +6,7 @@
; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
; EG: SUB_INT {{[* ]*}}[[RES]]
; EG-NOT: BFE
-define void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) {
+define amdgpu_kernel void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) {
%sext = sext i1 %a to i32
%res = add i32 %b, %sext
store i32 %res, i32 addrspace(1)* %out
@@ -18,7 +18,7 @@ define void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) {
; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
; EG: ADD_INT {{[* ]*}}[[RES]]
; EG-NOT: BFE
-define void @sext_in_reg_i1_i32_sub(i32 addrspace(1)* %out, i1 %a, i32 %b) {
+define amdgpu_kernel void @sext_in_reg_i1_i32_sub(i32 addrspace(1)* %out, i1 %a, i32 %b) {
%sext = sext i1 %a to i32
%res = sub i32 %b, %sext
store i32 %res, i32 addrspace(1)* %out
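
The check lines in this file are inverted on purpose: the add test expects SUB_INT and the sub test expects ADD_INT. That is the optimization under test — sext i1 %a is 0 or -1, i.e. equal to -(zext i1 %a), so %b + sext(%a) folds to %b - zext(%a) and %b - sext(%a) folds to %b + zext(%a), eliminating the sign extension (hence EG-NOT: BFE). A standalone sketch of the identity:

define i32 @sext_as_neg_zext(i1 %a) {
  %z = zext i1 %a to i32    ; 0 or 1
  %neg = sub i32 0, %z      ; 0 or -1, equal to sext i1 %a to i32
  ret i32 %neg
}
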
diff --git a/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll b/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
index adba6bbb51d4..7ac4e1d9fe4b 100644
--- a/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
+++ b/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
@@ -11,7 +11,7 @@
; EG: LSHR {{\*?}} [[ADDR]]
; Works with the align 2 removed
-define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+define amdgpu_kernel void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
%c = add <2 x i32> %a, %b
%x = shl <2 x i32> %c, <i32 6, i32 6>
%y = ashr <2 x i32> %x, <i32 7, i32 7>
diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll
index 4c58261709c4..b702e1c07200 100644
--- a/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -1,8 +1,10 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FIXME: i16 promotion pass ruins the scalar cases when legal.
+; FIXME: r600 fails verifier
; FUNC-LABEL: {{^}}sext_in_reg_i1_i32:
; GCN: s_load_dword [[ARG:s[0-9]+]],
@@ -13,7 +15,7 @@
; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
; EG: LSHR * [[ADDR]]
; EG: BFE_INT * [[RES]], {{.*}}, 0.0, 1
-define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) #0 {
%shl = shl i32 %in, 31
%sext = ashr i32 %shl, 31
store i32 %sext, i32 addrspace(1)* %out
@@ -30,7 +32,7 @@ define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) #0 {
; EG: ADD_INT
; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
; EG-NEXT: LSHR * [[ADDR]]
-define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%c = add i32 %a, %b ; add to prevent folding into extload
%shl = shl i32 %c, 24
%ashr = ashr i32 %shl, 24
@@ -48,7 +50,7 @@ define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; EG: ADD_INT
; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
; EG-NEXT: LSHR * [[ADDR]]
-define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%c = add i32 %a, %b ; add to prevent folding into extload
%shl = shl i32 %c, 16
%ashr = ashr i32 %shl, 16
@@ -66,7 +68,7 @@ define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; EG: ADD_INT
; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
; EG-NEXT: LSHR * [[ADDR]]
-define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
%c = add <1 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <1 x i32> %c, <i32 24>
%ashr = ashr <1 x i32> %shl, <i32 24>
@@ -80,7 +82,7 @@ define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a,
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
%c = shl i64 %a, %b
%shl = shl i64 %c, 63
%ashr = ashr i64 %shl, 63
@@ -94,7 +96,7 @@ define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
%c = shl i64 %a, %b
%shl = shl i64 %c, 56
%ashr = ashr i64 %shl, 56
@@ -109,7 +111,7 @@ define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
%c = shl i64 %a, %b
%shl = shl i64 %c, 48
%ashr = ashr i64 %shl, 48
@@ -123,7 +125,7 @@ define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
%c = shl i64 %a, %b
%shl = shl i64 %c, 32
%ashr = ashr i64 %shl, 32
@@ -138,7 +140,7 @@ define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
; XGCN: buffer_store_dword
; XEG: BFE_INT
; XEG: ASHR
-; define void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) #0 {
+; define amdgpu_kernel void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) #0 {
; %c = add <1 x i64> %a, %b
; %shl = shl <1 x i64> %c, <i64 56>
; %ashr = ashr <1 x i64> %shl, <i64 56>
@@ -150,15 +152,15 @@ define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
; SI: buffer_load_dwordx2
; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
-; VI: flat_load_dwordx2
-; VI: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; GFX89: flat_load_dwordx2
+; GFX89: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
; GCN: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1
; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
+; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define amdgpu_kernel void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
%b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
@@ -177,15 +179,15 @@ define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
; SI: buffer_load_dwordx2
; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
-; VI: flat_load_dwordx2
-; VI: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; GFX89: flat_load_dwordx2
+; GFX89: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
; GCN: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 8
; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
+; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define amdgpu_kernel void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
%b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
@@ -204,15 +206,15 @@ define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
; SI: buffer_load_dwordx2
; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
-; VI: flat_load_dwordx2
-; VI: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; GFX89: flat_load_dwordx2
+; GFX89: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
; GCN: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 16
; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
+; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define amdgpu_kernel void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
%b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
@@ -231,12 +233,12 @@ define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; SI: buffer_load_dwordx2
; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
-; VI: flat_load_dwordx2
-; VI: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
+; GFX89: flat_load_dwordx2
+; GFX89: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
; GCN: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]]
-; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[SHR]]{{\]}}
-define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
+; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[SHR]]{{\]}}
+define amdgpu_kernel void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
%b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
@@ -262,7 +264,7 @@ define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; EG: LSHL
; EG: ASHR [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
-define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%c = add i32 %a, %b
%x = shl i32 %c, 6
%y = ashr i32 %x, 7
@@ -285,7 +287,7 @@ define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a,
; EG: LSHL
; EG: ASHR [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
-define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
%c = add <2 x i32> %a, %b
%x = shl <2 x i32> %c, <i32 6, i32 6>
%y = ashr <2 x i32> %x, <i32 7, i32 7>
@@ -303,7 +305,7 @@ define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
-define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
%c = add <2 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <2 x i32> %c, <i32 31, i32 31>
%ashr = ashr <2 x i32> %shl, <i32 31, i32 31>
@@ -324,7 +326,7 @@ define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
-define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
%c = add <4 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <4 x i32> %c, <i32 31, i32 31, i32 31, i32 31>
%ashr = ashr <4 x i32> %shl, <i32 31, i32 31, i32 31, i32 31>
@@ -341,7 +343,7 @@ define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
-define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
%c = add <2 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <2 x i32> %c, <i32 24, i32 24>
%ashr = ashr <2 x i32> %shl, <i32 24, i32 24>
@@ -362,7 +364,7 @@ define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
-define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
%c = add <4 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
%ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
@@ -379,7 +381,7 @@ define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
-define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
%c = add <2 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <2 x i32> %c, <i32 16, i32 16>
%ashr = ashr <2 x i32> %shl, <i32 16, i32 16>
@@ -388,7 +390,7 @@ define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32>
}
; FUNC-LABEL: {{^}}testcase:
-define void @testcase(i8 addrspace(1)* %out, i8 %a) #0 {
+define amdgpu_kernel void @testcase(i8 addrspace(1)* %out, i8 %a) #0 {
%and_a_1 = and i8 %a, 1
%cmp_eq = icmp eq i8 %and_a_1, 0
%cmp_slt = icmp slt i8 %a, 0
@@ -400,7 +402,7 @@ define void @testcase(i8 addrspace(1)* %out, i8 %a) #0 {
}
; FUNC-LABEL: {{^}}testcase_3:
-define void @testcase_3(i8 addrspace(1)* %out, i8 %a) #0 {
+define amdgpu_kernel void @testcase_3(i8 addrspace(1)* %out, i8 %a) #0 {
%and_a_1 = and i8 %a, 1
%cmp_eq = icmp eq i8 %and_a_1, 0
%cmp_slt = icmp slt i8 %a, 0
@@ -416,7 +418,7 @@ define void @testcase_3(i8 addrspace(1)* %out, i8 %a) #0 {
; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
-define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 {
+define amdgpu_kernel void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 {
%loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16
%loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16
%c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
@@ -429,7 +431,7 @@ define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i
; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i16_to_v4i32:
; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
-define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 {
+define amdgpu_kernel void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 {
%loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16
%loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16
%c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
@@ -444,7 +446,7 @@ define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x
; GCN: v_max_i32
; GCN-NOT: bfe
; GCN: buffer_store_short
-define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) #0 {
+define amdgpu_kernel void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) #0 {
%tmp5 = load i8, i8 addrspace(1)* %src, align 1
%tmp2 = sext i8 %tmp5 to i32
%tmp2.5 = icmp sgt i32 %tmp2, 0
@@ -455,167 +457,22 @@ define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 ad
ret void
}
-declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone
-
-; FUNC-LABEL: {{^}}bfe_0_width:
-; GCN-NOT: {{[^@]}}bfe
-; GCN: s_endpgm
-define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
- %load = load i32, i32 addrspace(1)* %ptr, align 4
- %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 8, i32 0) nounwind readnone
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_8_bfe_8:
-; GCN: v_bfe_i32
-; GCN-NOT: {{[^@]}}bfe
-; GCN: s_endpgm
-define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
- %load = load i32, i32 addrspace(1)* %ptr, align 4
- %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
- %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone
- store i32 %bfe1, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_8_bfe_16:
-; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
-; GCN: s_endpgm
-define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
- %load = load i32, i32 addrspace(1)* %ptr, align 4
- %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
- %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 16) nounwind readnone
- store i32 %bfe1, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; This really should be folded into 1
-; FUNC-LABEL: {{^}}bfe_16_bfe_8:
-; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
-; GCN-NOT: {{[^@]}}bfe
-; GCN: s_endpgm
-define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
- %load = load i32, i32 addrspace(1)* %ptr, align 4
- %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 16) nounwind readnone
- %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone
- store i32 %bfe1, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; Make sure there isn't a redundant BFE
-; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe:
-; GCN: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}}
-; GCN-NOT: {{[^@]}}bfe
-; GCN: s_endpgm
-define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
- %c = add i32 %a, %b ; add to prevent folding into extload
- %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 0, i32 8) nounwind readnone
- %shl = shl i32 %bfe, 24
- %ashr = ashr i32 %shl, 24
- store i32 %ashr, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong:
-define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
- %c = add i32 %a, %b ; add to prevent folding into extload
- %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 8, i32 0) nounwind readnone
- %shl = shl i32 %bfe, 24
- %ashr = ashr i32 %shl, 24
- store i32 %ashr, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe:
-; GCN: buffer_load_sbyte
-; GCN-NOT: {{[^@]}}bfe
-; GCN: s_endpgm
-define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 {
- %load = load i8, i8 addrspace(1)* %ptr, align 1
- %sext = sext i8 %load to i32
- %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 0, i32 8) nounwind readnone
- %shl = shl i32 %bfe, 24
- %ashr = ashr i32 %shl, 24
- store i32 %ashr, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; GCN: .text
-; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe_0:{{.*$}}
-; GCN-NOT: {{[^@]}}bfe
-; GCN: s_endpgm
-define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 {
- %load = load i8, i8 addrspace(1)* %ptr, align 1
- %sext = sext i8 %load to i32
- %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 8, i32 0) nounwind readnone
- %shl = shl i32 %bfe, 24
- %ashr = ashr i32 %shl, 24
- store i32 %ashr, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_0:
-; GCN-NOT: shr
-; GCN-NOT: shl
-; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
-; GCN: s_endpgm
-define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %shl = shl i32 %x, 31
- %shr = ashr i32 %shl, 31
- %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 0, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_1:
-; GCN: buffer_load_dword
-; GCN-NOT: shl
-; GCN-NOT: shr
-; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1
-; GCN: s_endpgm
-define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %shl = shl i32 %x, 30
- %shr = ashr i32 %shl, 30
- %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 1)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}sext_in_reg_i2_bfe_offset_1:
-; GCN: buffer_load_dword
-; GCN-NOT: v_lshl
-; GCN-NOT: v_ashr
-; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 2
-; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2
-; GCN: s_endpgm
-define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %x = load i32, i32 addrspace(1)* %in, align 4
- %shl = shl i32 %x, 30
- %shr = ashr i32 %shl, 30
- %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 2)
- store i32 %bfe, i32 addrspace(1)* %out, align 4
- ret void
-}
-
; Make sure we propagate the VALUness to users of a moved scalar BFE.
; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64_move_use:
; SI: buffer_load_dwordx2
; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
-; VI: flat_load_dwordx2
-; VI: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; GFX89: flat_load_dwordx2
+; GFX89: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
; GCN-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1
; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; GCN-DAG: v_and_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, v[[LO]]
; GCN-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[HI]]
; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
-; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
-define void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 {
+; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
+define amdgpu_kernel void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
%b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
@@ -636,16 +493,16 @@ define void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrsp
; SI: buffer_load_dwordx2
; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
-; VI: flat_load_dwordx2
-; VI: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
+; GFX89: flat_load_dwordx2
+; GFX89: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
; GCN-DAG: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]]
; GCN-DAG: v_and_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, v[[LO]]
; GCN-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[SHR]]
; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
-; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
-define void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 {
+; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
+define amdgpu_kernel void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
%b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
@@ -668,10 +525,10 @@ define void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrs
; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
; SI: buffer_store_short [[VBFE]]
-; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
-; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
-; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
-define void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
+; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
+define amdgpu_kernel void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
%ld = load i32, i32 addrspace(2)* %ptr
%in = trunc i32 %ld to i16
%shl = shl i16 %in, 15
@@ -687,10 +544,10 @@ define void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr
; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
; SI: buffer_store_short [[VBFE]]
-; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14
-; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
-; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14
-define void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14
+; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14
+define amdgpu_kernel void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
%ld = load i32, i32 addrspace(2)* %ptr
%in = trunc i32 %ld to i16
%shl = shl i16 %in, 14
@@ -704,7 +561,7 @@ define void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr
; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[VAL]], 0, 1{{$}}
; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]]
-define void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addrspace(1)* %ptr) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%gep = getelementptr i16, i16 addrspace(1)* %ptr, i32 %tid
%out.gep = getelementptr i16, i16 addrspace(3)* %out, i32 %tid
@@ -721,11 +578,11 @@ define void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addrspace(1)* %ptr
; GCN: {{buffer|flat}}_load_ushort [[VAL1:v[0-9]+]]
; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]]
-; VI: v_lshlrev_b16_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]]
+; GFX89: v_lshlrev_b16_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]]
; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[REG]], 0, 1{{$}}
; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]]
-define void @v_sext_in_reg_i1_i16_nonload(i16 addrspace(3)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 %s.val) nounwind {
+define amdgpu_kernel void @v_sext_in_reg_i1_i16_nonload(i16 addrspace(3)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 %s.val) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%a.gep = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
%b.gep = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
@@ -748,10 +605,10 @@ define void @v_sext_in_reg_i1_i16_nonload(i16 addrspace(3)* %out, i16 addrspace(
; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
; SI: buffer_store_short [[VBFE]]
-; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}}
-; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
-; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}}
-define void @s_sext_in_reg_i2_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
+; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}}
+; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}}
+define amdgpu_kernel void @s_sext_in_reg_i2_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
%shl = shl i16 %in, 14
%sext = ashr i16 %shl, 14
store i16 %sext, i16 addrspace(1)* %out
@@ -765,10 +622,10 @@ define void @s_sext_in_reg_i2_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
; SI: v_mov_b32_e32 [[VSEXT:v[0-9]+]], [[SSEXT]]
; SI: buffer_store_short [[VBFE]]
-; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
-; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
-; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
-define void @s_sext_in_reg_i8_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
+; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
+; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
+define amdgpu_kernel void @s_sext_in_reg_i8_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
%shl = shl i16 %in, 8
%sext = ashr i16 %shl, 8
store i16 %sext, i16 addrspace(1)* %out
@@ -782,16 +639,82 @@ define void @s_sext_in_reg_i8_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
; SI: buffer_store_short [[VBFE]]
-; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
-; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
-; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
-define void @s_sext_in_reg_i15_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
+; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
+; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
+define amdgpu_kernel void @s_sext_in_reg_i15_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
%shl = shl i16 %in, 1
%sext = ashr i16 %shl, 1
store i16 %sext, i16 addrspace(1)* %out
ret void
}
+; FUNC-LABEL: {{^}}sext_in_reg_v2i1_to_v2i16:
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 15, [[ADD]]
+; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 15, [[SHL]]
+define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
+ %c = add <2 x i16> %a, %b ; add to prevent folding into extload
+ %shl = shl <2 x i16> %c, <i16 15, i16 15>
+ %ashr = ashr <2 x i16> %shl, <i16 15, i16 15>
+ store <2 x i16> %ashr, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v3i1_to_v3i16:
+; GFX9: v_pk_add_u16
+; GFX9: v_pk_add_u16
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}}
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}}
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}}
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}}
+define amdgpu_kernel void @sext_in_reg_v3i1_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 {
+ %c = add <3 x i16> %a, %b ; add to prevent folding into extload
+ %shl = shl <3 x i16> %c, <i16 15, i16 15, i16 15>
+ %ashr = ashr <3 x i16> %shl, <i16 15, i16 15, i16 15>
+ store <3 x i16> %ashr, <3 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v2i2_to_v2i16:
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 14, [[ADD]]
+; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 14, [[SHL]]
+define amdgpu_kernel void @sext_in_reg_v2i2_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
+ %c = add <2 x i16> %a, %b ; add to prevent folding into extload
+ %shl = shl <2 x i16> %c, <i16 14, i16 14>
+ %ashr = ashr <2 x i16> %shl, <i16 14, i16 14>
+ store <2 x i16> %ashr, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v2i8_to_v2i16:
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 8, [[ADD]]
+; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 8, [[SHL]]
+define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
+ %c = add <2 x i16> %a, %b ; add to prevent folding into extload
+ %shl = shl <2 x i16> %c, <i16 8, i16 8>
+ %ashr = ashr <2 x i16> %shl, <i16 8, i16 8>
+ store <2 x i16> %ashr, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v3i8_to_v3i16:
+; GFX9: v_pk_add_u16
+; GFX9: v_pk_add_u16
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+define amdgpu_kernel void @sext_in_reg_v3i8_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 {
+ %c = add <3 x i16> %a, %b ; add to prevent folding into extload
+ %shl = shl <3 x i16> %c, <i16 8, i16 8, i16 8>
+ %ashr = ashr <3 x i16> %shl, <i16 8, i16 8, i16 8>
+ store <3 x i16> %ashr, <3 x i16> addrspace(1)* %out
+ ret void
+}
+
declare i32 @llvm.r600.read.tidig.x() #1
attributes #0 = { nounwind }
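
The new gfx901 RUN line and the GFX9 check blocks added above cover packed 16-bit math: for the <2 x i16> (and padded <3 x i16>) cases, sext-in-reg is expected to select the packed v_pk_lshlrev_b16 / v_pk_ashrrev_i16 pair rather than scalarized shifts. A sketch of the idiom those tests exercise (hypothetical kernel name; the shl/ashr-by-the-same-amount pair is the canonical sign_extend_inreg pattern):

define amdgpu_kernel void @sketch_v2i8_in_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) {
  %c = add <2 x i16> %a, %b                 ; add prevents folding into an extload
  %shl = shl <2 x i16> %c, <i16 8, i16 8>
  %sext = ashr <2 x i16> %shl, <i16 8, i16 8>
  store <2 x i16> %sext, <2 x i16> addrspace(1)* %out
  ret void
}
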
diff --git a/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/test/CodeGen/AMDGPU/sgpr-control-flow.ll
index d5d2f6b717f9..8e18ab5554e4 100644
--- a/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -13,7 +13,7 @@
; SI: s_sub
-define void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
entry:
%0 = icmp eq i32 %a, 0
br i1 %0, label %if, label %else
@@ -52,7 +52,7 @@ endif:
; SI: s_add_i32 s{{[0-9]+}}, [[LOAD0]], [[LOAD1]]
; SI: buffer_store_dword
; SI-NEXT: s_endpgm
-define void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
entry:
%0 = icmp eq i32 %a, 0
br i1 %0, label %if, label %else
@@ -79,7 +79,7 @@ endif:
; SI: s_add_i32 [[SGPR:s[0-9]+]]
; SI-NOT: s_add_i32 [[SGPR]]
-define void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+define amdgpu_kernel void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%tid_f = uitofp i32 %tid to float
@@ -116,7 +116,7 @@ endif:
; SI: v_cmp_ne_u32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]]
; SI: buffer_store_dword [[RESULT]]
-define void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%tmp1 = icmp eq i32 %tid, 0
diff --git a/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
index f44ae6e09e9f..fb0bbaa9cbf2 100644
--- a/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
+++ b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
@@ -6,7 +6,7 @@
; SI-LABEL: {{^}}test_dup_operands:
; SI: v_add_i32_e32
-define void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) {
+define amdgpu_kernel void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) {
%a = load <2 x i32>, <2 x i32> addrspace(1)* %in
%lo = extractelement <2 x i32> %a, i32 0
%hi = extractelement <2 x i32> %a, i32 1
diff --git a/test/CodeGen/AMDGPU/sgpr-copy.ll b/test/CodeGen/AMDGPU/sgpr-copy.ll
index 013f5253b369..5c20e9a8d585 100644
--- a/test/CodeGen/AMDGPU/sgpr-copy.ll
+++ b/test/CodeGen/AMDGPU/sgpr-copy.ll
@@ -1,13 +1,6 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
-; This test checks that no VGPR to SGPR copies are created by the register
-; allocator.
-
-
-declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-
-
; CHECK-LABEL: {{^}}phi1:
; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]]
@@ -29,13 +22,13 @@ ELSE: ; preds = %main_body
ENDIF: ; preds = %ELSE, %main_body
%temp.0 = phi float [ %tmp26, %ELSE ], [ %tmp21, %main_body ]
%tmp27 = fadd float %temp.0, %tmp23
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %tmp27, float %tmp27, float 0.000000e+00, float 1.000000e+00)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp27, float %tmp27, float 0.000000e+00, float 1.000000e+00, i1 true, i1 true) #0
ret void
}
; Make sure this program doesn't crash
; CHECK-LABEL: {{^}}phi2:
-define amdgpu_ps void @phi2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+define amdgpu_ps void @phi2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #1 {
main_body:
%tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
%tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
@@ -58,28 +51,54 @@ main_body:
%tmp37 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp36, !tbaa !0
%tmp38 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg1, i32 0
%tmp39 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp38, !tbaa !0
- %tmp40 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg3, <2 x i32> %arg5)
- %tmp41 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg3, <2 x i32> %arg5)
- %tmp42 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %arg3, <2 x i32> %arg5)
- %tmp43 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %arg3, <2 x i32> %arg5)
- %tmp44 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %arg3, <2 x i32> %arg5)
- %tmp45 = bitcast float %tmp40 to i32
- %tmp46 = bitcast float %tmp41 to i32
+ %i.i = extractelement <2 x i32> %arg5, i32 0
+ %j.i = extractelement <2 x i32> %arg5, i32 1
+ %i.f.i = bitcast i32 %i.i to float
+ %j.f.i = bitcast i32 %j.i to float
+ %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg3) #1
+ %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg3) #1
+ %i.i19 = extractelement <2 x i32> %arg5, i32 0
+ %j.i20 = extractelement <2 x i32> %arg5, i32 1
+ %i.f.i21 = bitcast i32 %i.i19 to float
+ %j.f.i22 = bitcast i32 %j.i20 to float
+ %p1.i23 = call float @llvm.amdgcn.interp.p1(float %i.f.i21, i32 1, i32 0, i32 %arg3) #1
+ %p2.i24 = call float @llvm.amdgcn.interp.p2(float %p1.i23, float %j.f.i22, i32 1, i32 0, i32 %arg3) #1
+ %i.i13 = extractelement <2 x i32> %arg5, i32 0
+ %j.i14 = extractelement <2 x i32> %arg5, i32 1
+ %i.f.i15 = bitcast i32 %i.i13 to float
+ %j.f.i16 = bitcast i32 %j.i14 to float
+ %p1.i17 = call float @llvm.amdgcn.interp.p1(float %i.f.i15, i32 0, i32 1, i32 %arg3) #1
+ %p2.i18 = call float @llvm.amdgcn.interp.p2(float %p1.i17, float %j.f.i16, i32 0, i32 1, i32 %arg3) #1
+ %i.i7 = extractelement <2 x i32> %arg5, i32 0
+ %j.i8 = extractelement <2 x i32> %arg5, i32 1
+ %i.f.i9 = bitcast i32 %i.i7 to float
+ %j.f.i10 = bitcast i32 %j.i8 to float
+ %p1.i11 = call float @llvm.amdgcn.interp.p1(float %i.f.i9, i32 1, i32 1, i32 %arg3) #1
+ %p2.i12 = call float @llvm.amdgcn.interp.p2(float %p1.i11, float %j.f.i10, i32 1, i32 1, i32 %arg3) #1
+ %i.i1 = extractelement <2 x i32> %arg5, i32 0
+ %j.i2 = extractelement <2 x i32> %arg5, i32 1
+ %i.f.i3 = bitcast i32 %i.i1 to float
+ %j.f.i4 = bitcast i32 %j.i2 to float
+ %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 1, i32 %arg3) #1
+ %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 1, i32 %arg3) #1
+ %tmp45 = bitcast float %p2.i to i32
+ %tmp46 = bitcast float %p2.i24 to i32
%tmp47 = insertelement <2 x i32> undef, i32 %tmp45, i32 0
%tmp48 = insertelement <2 x i32> %tmp47, i32 %tmp46, i32 1
%tmp39.bc = bitcast <16 x i8> %tmp39 to <4 x i32>
- %tmp49 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp48, <8 x i32> %tmp37, <4 x i32> %tmp39.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %tmp50 = extractelement <4 x float> %tmp49, i32 2
- %tmp51 = call float @fabs(float %tmp50)
- %tmp52 = fmul float %tmp42, %tmp42
- %tmp53 = fmul float %tmp43, %tmp43
+ %a.bc.i = bitcast <2 x i32> %tmp48 to <2 x float>
+ %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp37, <4 x i32> %tmp39.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %tmp50 = extractelement <4 x float> %tmp1, i32 2
+ %tmp51 = call float @llvm.fabs.f32(float %tmp50)
+ %tmp52 = fmul float %p2.i18, %p2.i18
+ %tmp53 = fmul float %p2.i12, %p2.i12
%tmp54 = fadd float %tmp53, %tmp52
- %tmp55 = fmul float %tmp44, %tmp44
+ %tmp55 = fmul float %p2.i6, %p2.i6
%tmp56 = fadd float %tmp54, %tmp55
%tmp57 = call float @llvm.amdgcn.rsq.f32(float %tmp56)
- %tmp58 = fmul float %tmp42, %tmp57
- %tmp59 = fmul float %tmp43, %tmp57
- %tmp60 = fmul float %tmp44, %tmp57
+ %tmp58 = fmul float %p2.i18, %tmp57
+ %tmp59 = fmul float %p2.i12, %tmp57
+ %tmp60 = fmul float %p2.i6, %tmp57
%tmp61 = fmul float %tmp58, %tmp22
%tmp62 = fmul float %tmp59, %tmp23
%tmp63 = fadd float %tmp62, %tmp61
@@ -90,7 +109,7 @@ main_body:
%tmp68 = fadd float %tmp67, %tmp66
%tmp69 = fmul float %tmp26, %tmp68
%tmp70 = fmul float %tmp27, %tmp68
- %tmp71 = call float @fabs(float %tmp69)
+ %tmp71 = call float @llvm.fabs.f32(float %tmp69)
%tmp72 = fcmp olt float 0x3EE4F8B580000000, %tmp71
%tmp73 = sext i1 %tmp72 to i32
%tmp74 = bitcast i32 %tmp73 to float
@@ -110,7 +129,7 @@ IF: ; preds = %main_body
ENDIF: ; preds = %IF, %main_body
%temp4.0 = phi float [ %tmp83, %IF ], [ %tmp31, %main_body ]
- %tmp84 = call float @fabs(float %tmp70)
+ %tmp84 = call float @llvm.fabs.f32(float %tmp70)
%tmp85 = fcmp olt float 0x3EE4F8B580000000, %tmp84
%tmp86 = sext i1 %tmp85 to i32
%tmp87 = bitcast i32 %tmp86 to float
@@ -146,11 +165,9 @@ ENDIF24: ; preds = %IF25, %ENDIF
%tmp110 = fmul float %tmp109, %tmp106
%tmp111 = fsub float -0.000000e+00, %tmp105
%tmp112 = fmul float %tmp111, %tmp106
- %tmp113 = call i32 @llvm.SI.packf16(float %tmp108, float %tmp110)
- %tmp114 = bitcast i32 %tmp113 to float
- %tmp115 = call i32 @llvm.SI.packf16(float %tmp112, float 1.000000e+00)
- %tmp116 = bitcast i32 %tmp115 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp114, float %tmp116, float %tmp114, float %tmp116)
+ %tmp113 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp108, float %tmp110)
+ %tmp115 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp112, float 1.000000e+00)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp113, <2 x half> %tmp115, i1 true, i1 true) #0
ret void
}
@@ -183,7 +200,7 @@ LOOP: ; preds = %ENDIF, %main_body
br i1 %tmp33, label %IF, label %ENDIF
IF: ; preds = %LOOP
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp4.0, float %temp5.0, float %temp6.0, float 1.000000e+00)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %temp4.0, float %temp5.0, float %temp6.0, float 1.000000e+00, i1 true, i1 true) #0
ret void
ENDIF: ; preds = %LOOP
@@ -193,31 +210,6 @@ ENDIF: ; preds = %LOOP
br label %LOOP
}
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-
-; Function Attrs: readonly
-declare float @fabs(float) #2
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
-
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <8 x i32>, <16 x i8>, i32) #1
-
-; Function Attrs: readnone
-declare float @llvm.amdgcn.rsq.f32(float) #1
-
-declare float @llvm.exp2.f32(float) #1
-
-; Function Attrs: nounwind readnone
-declare float @llvm.pow.f32(float, float) #1
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #1
-
; This checks for a bug in the FixSGPRCopies pass where VReg96
; registers were incorrectly identified as an SGPR regclass, which
; caused an assertion failure.
@@ -248,24 +240,24 @@ entry:
br i1 %tmp27, label %if, label %else
if: ; preds = %entry
- %val.if = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 11, i32 13>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %val.if.0 = extractelement <4 x float> %val.if, i32 0
- %val.if.1 = extractelement <4 x float> %val.if, i32 1
- %val.if.2 = extractelement <4 x float> %val.if, i32 2
+ %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 0x36D6000000000000, float 0x36DA000000000000>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %val.if.0 = extractelement <4 x float> %tmp1, i32 0
+ %val.if.1 = extractelement <4 x float> %tmp1, i32 1
+ %val.if.2 = extractelement <4 x float> %tmp1, i32 2
br label %endif
else: ; preds = %entry
- %val.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 5, i32 7>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %val.else.0 = extractelement <4 x float> %val.else, i32 0
- %val.else.1 = extractelement <4 x float> %val.else, i32 1
- %val.else.2 = extractelement <4 x float> %val.else, i32 2
+ %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 0x36C4000000000000, float 0x36CC000000000000>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %val.else.0 = extractelement <4 x float> %tmp2, i32 0
+ %val.else.1 = extractelement <4 x float> %tmp2, i32 1
+ %val.else.2 = extractelement <4 x float> %tmp2, i32 2
br label %endif
endif: ; preds = %else, %if
%val.0 = phi float [ %val.if.0, %if ], [ %val.else.0, %else ]
%val.1 = phi float [ %val.if.1, %if ], [ %val.else.1, %else ]
%val.2 = phi float [ %val.if.2, %if ], [ %val.else.2, %else ]
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %val.0, float %val.1, float %val.2, float 0.000000e+00)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %val.0, float %val.1, float %val.2, float 0.000000e+00, i1 true, i1 true) #0
ret void
}
@@ -273,7 +265,7 @@ endif: ; preds = %else, %if
; CHECK: buffer_load_dword
; CHECK: v_add
; CHECK: s_endpgm
-define void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) {
+define amdgpu_kernel void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) {
entry:
%tmp = load float, float addrspace(1)* %in0
%tmp1 = fcmp oeq float %tmp, 0.000000e+00
@@ -312,7 +304,7 @@ LOOP68: ; preds = %ENDIF69, %entry
IF70: ; preds = %LOOP68
%q = icmp ne i32 %l, 13
%temp.8 = select i1 %q, float 1.000000e+00, float 0.000000e+00
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp.8, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %temp.8, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, i1 true, i1 true) #0
ret void
ENDIF69: ; preds = %LOOP68
@@ -337,41 +329,53 @@ ENDIF69: ; preds = %LOOP68
define amdgpu_ps void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
bb:
%tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0
- %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !2
+ %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !3
%tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp22, i32 16)
%tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %arg3, i32 0, i32 0
- %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !2
+ %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !3
%tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %arg2, i32 0, i32 0
- %tmp28 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp27, !tbaa !2
- %tmp29 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg5, <2 x i32> %arg7)
- %tmp30 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg5, <2 x i32> %arg7)
+ %tmp28 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp27, !tbaa !3
+ %i.i = extractelement <2 x i32> %arg7, i32 0
+ %j.i = extractelement <2 x i32> %arg7, i32 1
+ %i.f.i = bitcast i32 %i.i to float
+ %j.f.i = bitcast i32 %j.i to float
+ %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg5) #0
+ %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg5) #0
+ %i.i1 = extractelement <2 x i32> %arg7, i32 0
+ %j.i2 = extractelement <2 x i32> %arg7, i32 1
+ %i.f.i3 = bitcast i32 %i.i1 to float
+ %j.f.i4 = bitcast i32 %j.i2 to float
+ %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 1, i32 0, i32 %arg5) #0
+ %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 1, i32 0, i32 %arg5) #0
%tmp31 = bitcast float %tmp23 to i32
%tmp36 = icmp ne i32 %tmp31, 0
br i1 %tmp36, label %bb38, label %bb80
bb38: ; preds = %bb
- %tmp52 = bitcast float %tmp29 to i32
- %tmp53 = bitcast float %tmp30 to i32
+ %tmp52 = bitcast float %p2.i to i32
+ %tmp53 = bitcast float %p2.i6 to i32
%tmp54 = insertelement <2 x i32> undef, i32 %tmp52, i32 0
%tmp55 = insertelement <2 x i32> %tmp54, i32 %tmp53, i32 1
%tmp56 = bitcast <8 x i32> %tmp26 to <8 x i32>
- %tmp58 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp55, <8 x i32> %tmp56, <4 x i32> %tmp28, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %a.bc.i = bitcast <2 x i32> %tmp55 to <2 x float>
+ %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp56, <4 x i32> %tmp28, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
br label %bb71
bb80: ; preds = %bb
- %tmp81 = bitcast float %tmp29 to i32
- %tmp82 = bitcast float %tmp30 to i32
+ %tmp81 = bitcast float %p2.i to i32
+ %tmp82 = bitcast float %p2.i6 to i32
%tmp82.2 = add i32 %tmp82, 1
%tmp83 = insertelement <2 x i32> undef, i32 %tmp81, i32 0
%tmp84 = insertelement <2 x i32> %tmp83, i32 %tmp82.2, i32 1
%tmp85 = bitcast <8 x i32> %tmp26 to <8 x i32>
- %tmp87 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp84, <8 x i32> %tmp85, <4 x i32> %tmp28, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %a.bc.i1 = bitcast <2 x i32> %tmp84 to <2 x float>
+ %tmp3 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i1, <8 x i32> %tmp85, <4 x i32> %tmp28, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
br label %bb71
bb71: ; preds = %bb80, %bb38
- %tmp72 = phi <4 x float> [ %tmp58, %bb38 ], [ %tmp87, %bb80 ]
+ %tmp72 = phi <4 x float> [ %tmp2, %bb38 ], [ %tmp3, %bb80 ]
%tmp88 = extractelement <4 x float> %tmp72, i32 0
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp88, float %tmp88, float %tmp88, float %tmp88)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp88, float %tmp88, float %tmp88, float %tmp88, i1 true, i1 true) #0
ret void
}
@@ -379,14 +383,14 @@ bb71: ; preds = %bb80, %bb38
; CHECK-LABEL: {{^}}mimg_srsrc_sgpr:
; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
define amdgpu_ps void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(2)* byval %arg) #0 {
+bb:
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i32 0, i32 %tid
%tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0
- %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %tmp10 = extractelement <4 x float> %tmp9, i32 0
- %tmp12 = call i32 @llvm.SI.packf16(float undef, float %tmp10)
- %tmp13 = bitcast i32 %tmp12 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp13, float undef, float undef, float undef)
+ %tmp = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 7.500000e-01, float 2.500000e-01>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %tmp10 = extractelement <4 x float> %tmp, i32 0
+ %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp10)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0
ret void
}
@@ -394,24 +398,35 @@ define amdgpu_ps void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(2)* byval %arg
; CHECK-LABEL: {{^}}mimg_ssamp_sgpr:
; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
define amdgpu_ps void @mimg_ssamp_sgpr([17 x <4 x i32>] addrspace(2)* byval %arg) #0 {
+bb:
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%tmp7 = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i32 0, i32 %tid
%tmp8 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp7, align 16, !tbaa !0
- %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> undef, <4 x i32> %tmp8, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %tmp10 = extractelement <4 x float> %tmp9, i32 0
- %tmp12 = call i32 @llvm.SI.packf16(float %tmp10, float undef)
- %tmp13 = bitcast i32 %tmp12 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp13, float undef, float undef, float undef)
+ %tmp = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 7.500000e-01, float 2.500000e-01>, <8 x i32> undef, <4 x i32> %tmp8, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %tmp10 = extractelement <4 x float> %tmp, i32 0
+ %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0
ret void
}
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.amdgcn.rsq.f32(float) #1
+declare float @llvm.exp2.f32(float) #1
+declare float @llvm.pow.f32(float, float) #1
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind readonly }
!0 = !{!1, !1, i64 0, i32 1}
-!1 = !{!"const", !3}
-!2 = !{!1, !1, i64 0}
-!3 = !{!"tbaa root"}
+!1 = !{!"const", !2}
+!2 = !{!"tbaa root"}
+!3 = !{!1, !1, i64 0}
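The rewrites in sgpr-copy.ll above are mechanical intrinsic migrations applied identically at every call site. As a condensed reference, the two recurring mappings are sketched below; operand names such as %ij, %m0, %r, %g are illustrative placeholders, not values from the test.

; Interpolation: one llvm.SI.fs.interp call becomes an unpack of the
; packed i/j barycentrics plus an interp.p1/interp.p2 pair.
  %v.old = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %m0, <2 x i32> %ij)
; becomes:
  %i = extractelement <2 x i32> %ij, i32 0
  %j = extractelement <2 x i32> %ij, i32 1
  %i.f = bitcast i32 %i to float
  %j.f = bitcast i32 %j to float
  %p1 = call float @llvm.amdgcn.interp.p1(float %i.f, i32 0, i32 0, i32 %m0)
  %v.new = call float @llvm.amdgcn.interp.p2(float %p1, float %j.f, i32 0, i32 0, i32 %m0)

; Compressed export: packf16 plus a bitcast into llvm.SI.export becomes
; cvt.pkrtz feeding exp.compr directly, with the done/vm flags as i1.
  %pk.old = call i32 @llvm.SI.packf16(float %r, float %g)
  %pk.f = bitcast i32 %pk.old to float
  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %pk.f, float undef, float undef, float undef)
; becomes:
  %pk = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %r, float %g)
  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %pk, <2 x half> undef, i1 true, i1 true)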
diff --git a/test/CodeGen/AMDGPU/sgprcopies.ll b/test/CodeGen/AMDGPU/sgprcopies.ll
new file mode 100644
index 000000000000..68cd83bb6cf0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sgprcopies.ll
@@ -0,0 +1,58 @@
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}checkTwoBlocksWithUniformBranch
+; GCN: BB0_2
+; GCN: v_add
+define amdgpu_kernel void @checkTwoBlocksWithUniformBranch(i32 addrspace(1)* nocapture %out, i32 %width, float %xPos, float %yPos, float %xStep, float %yStep, i32 %maxIter) {
+entry:
+ %conv = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %rem = urem i32 %conv, %width
+ %div = udiv i32 %conv, %width
+ %conv1 = sitofp i32 %rem to float
+ %x = tail call float @llvm.fmuladd.f32(float %xStep, float %conv1, float %xPos)
+ %conv2 = sitofp i32 %div to float
+ %y = tail call float @llvm.fmuladd.f32(float %yStep, float %conv2, float %yPos)
+ %yy = fmul float %y, %y
+ %xy = tail call float @llvm.fmuladd.f32(float %x, float %x, float %yy)
+ %cmp01 = fcmp ole float %xy, 4.000000e+00
+ %cmp02 = icmp ne i32 %maxIter, 0
+ %cond01 = and i1 %cmp02, %cmp01
+ br i1 %cond01, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %x_val = phi float [ %call8, %for.body ], [ %x, %for.body.preheader ]
+ %iter_val = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %y_val = phi float [ %call9, %for.body ], [ %y, %for.body.preheader ]
+ %sub = fsub float -0.000000e+00, %y_val
+ %call7 = tail call float @llvm.fmuladd.f32(float %x_val, float %x_val, float %x) #1
+ %call8 = tail call float @llvm.fmuladd.f32(float %sub, float %y_val, float %call7) #1
+ %mul = fmul float %x_val, 2.000000e+00
+ %call9 = tail call float @llvm.fmuladd.f32(float %mul, float %y_val, float %y) #1
+ %inc = add nuw i32 %iter_val, 1
+ %mul3 = fmul float %call9, %call9
+ %0 = tail call float @llvm.fmuladd.f32(float %call8, float %call8, float %mul3)
+ %cmp = fcmp ole float %0, 4.000000e+00
+ %cmp5 = icmp ult i32 %inc, %maxIter
+ %or.cond = and i1 %cmp5, %cmp
+ br i1 %or.cond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %iter.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %for.end.loopexit ]
+ %idxprom = ashr exact i32 %conv, 32
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %idxprom
+ store i32 %iter.0.lcssa, i32 addrspace(1)* %arrayidx, align 4
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare float @llvm.fmuladd.f32(float, float, float) #1
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { readnone }
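For readers tracing the new test: the loop in checkTwoBlocksWithUniformBranch is the classic escape-time iteration written with fmuladd. This is a reading of the IR above, not part of the test itself:

;   x' = fma(x, x, x0) - y*y        ; %call7 then %call8 (via %sub = -y)
;   y' = fma(2*x, y, y0)            ; %mul then %call9
;   loop while x'*x' + y'*y' <= 4.0 and ++iter < maxIter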
diff --git a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
index 48bbc32abcbb..0a29db4a0580 100644
--- a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
+++ b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
@@ -11,8 +11,8 @@
; GCN: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: s_endpgm
-define void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
- %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+define amdgpu_kernel void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
%ld.64 = load i128, i128 addrspace(1)* %in.gep
@@ -33,7 +33,7 @@ define void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)*
; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: s_endpgm
-define void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
@@ -55,8 +55,8 @@ define void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)*
; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: s_endpgm
-define void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
- %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+define amdgpu_kernel void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
%ld.64 = load i128, i128 addrspace(1)* %in.gep
@@ -77,7 +77,7 @@ define void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)*
; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: s_endpgm
-define void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
@@ -100,7 +100,7 @@ define void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)*
; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: s_endpgm
-define void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
@@ -113,5 +113,7 @@ define void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(
declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
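The i128 variants above follow the same bit-extract contract as the i64 file that comes next: a lshr/and pair whose mask is a run of low bits reduces to operations on the one or two source dwords that actually contribute, and the remaining result dwords are written as literal zeros (hence the buffer_store_dwordx4 of a shift result packed with zero registers). A condensed shape of one case, illustrative of v_uextract_bit_95_i128:

  %ld = load i128, i128 addrspace(1)* %in.gep
  %srl = lshr i128 %ld, 95
  %bit = and i128 %srl, 1        ; only bit 31 of dword 2 survives
  store i128 %bit, i128 addrspace(1)* %out.gep
; expected ISA: v_lshrrev_b32 by 31 on that dword, zeros elsewhere.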
diff --git a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
index b85714ea54c1..6f5fc6d0f38c 100644
--- a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
+++ b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
@@ -8,8 +8,8 @@
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+define amdgpu_kernel void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
%ld.64 = load i64, i64 addrspace(1)* %in.gep
@@ -25,7 +25,7 @@ define void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -41,8 +41,8 @@ define void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+define amdgpu_kernel void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
%ld.64 = load i64, i64 addrspace(1)* %in.gep
@@ -57,8 +57,8 @@ define void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in)
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 1
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+define amdgpu_kernel void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
%ld.64 = load i64, i64 addrspace(1)* %in.gep
@@ -73,7 +73,7 @@ define void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 1, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -89,7 +89,7 @@ define void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -105,8 +105,8 @@ define void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 2
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+define amdgpu_kernel void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
%ld.64 = load i64, i64 addrspace(1)* %in.gep
@@ -121,8 +121,8 @@ define void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+define amdgpu_kernel void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
%ld.64 = load i64, i64 addrspace(1)* %in.gep
@@ -137,8 +137,8 @@ define void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 1, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
%ld.64 = load i64, i64 addrspace(1)* %in.gep
@@ -155,8 +155,8 @@ define void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+define amdgpu_kernel void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
%ld.64 = load i64, i64 addrspace(1)* %in.gep
@@ -171,7 +171,7 @@ define void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 2
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -188,7 +188,7 @@ define void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 0x3fffffff, v[[SHRLO]]{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -204,7 +204,7 @@ define void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -220,7 +220,7 @@ define void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 31
; GCN-NEXT: v_mov_b32_e32 v[[SHRHI]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
-define void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -236,7 +236,7 @@ define void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
; GCN: buffer_store_dword v[[SHIFT]]
-define void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
@@ -252,7 +252,7 @@ define void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspa
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 3, 1{{$}}
; GCN: buffer_store_dword [[BFE]]
-define void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
@@ -268,7 +268,7 @@ define void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspac
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 1, 1{{$}}
; GCN: buffer_store_dword [[BFE]]
-define void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
@@ -286,7 +286,7 @@ define void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspa
; GCN-NEXT: v_and_b32_e32 v[[SHRLO]], 3, v[[SHRLO]]
; GCN-NOT: v[[SHRLO]]
; GCN: buffer_store_dword v[[SHRLO]]
-define void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
@@ -306,7 +306,7 @@ define void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addr
; GCN-NOT: v[[SHRLO]]
; GCN-NOT: v[[SHRHI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
-define void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -327,7 +327,7 @@ define void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -347,7 +347,7 @@ define void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i6
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3
; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHR]]:[[ZERO_SHR]]{{\]}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO_BFE]]{{\]}}
-define void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -365,7 +365,7 @@ define void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i6
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
; GCN: buffer_store_dword v[[ZERO]]
-define void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out0.gep = getelementptr i64, i64 addrspace(1)* %out0, i32 %id.x
@@ -383,5 +383,7 @@ define void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %ou
declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
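Throughout this file the selection rule being pinned down is: (and (lshr x, lo), 2^width - 1), where the extracted field sits entirely inside one source dword, maps to a single v_bfe_u32 with that offset and width, and the known-zero high dword of the i64 result becomes a v_mov_b32 of 0. A worked instance matching the v_uextract_bit_20_21_i64 checks; the value names are illustrative:

  %ld = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld, 20
  %bit = and i64 %srl, 3               ; two bits starting at bit 20
  store i64 %bit, i64 addrspace(1)* %out.gep
; expected ISA: v_bfe_u32 v[[BFE]], [[VAL]], 20, 2 and v_mov_b32 v[[ZERO]], 0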
diff --git a/test/CodeGen/AMDGPU/shift-i64-opts.ll b/test/CodeGen/AMDGPU/shift-i64-opts.ll
index 28a7b924904d..a803849be02c 100644
--- a/test/CodeGen/AMDGPU/shift-i64-opts.ll
+++ b/test/CodeGen/AMDGPU/shift-i64-opts.ll
@@ -8,7 +8,7 @@
; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 3, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%val = load i64, i64 addrspace(1)* %in
%shl = lshr i64 %val, 35
store i64 %shl, i64 addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 31, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @lshr_i64_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_i64_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%val = load i64, i64 addrspace(1)* %in
%shl = lshr i64 %val, 63
store i64 %shl, i64 addrspace(1)* %out
@@ -32,7 +32,7 @@ define void @lshr_i64_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 1, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%val = load i64, i64 addrspace(1)* %in
%shl = lshr i64 %val, 33
store i64 %shl, i64 addrspace(1)* %out
@@ -43,7 +43,7 @@ define void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; GCN-DAG: buffer_load_dword v[[LO:[0-9]+]]
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%val = load i64, i64 addrspace(1)* %in
%shl = lshr i64 %val, 32
store i64 %shl, i64 addrspace(1)* %out
@@ -58,7 +58,7 @@ define void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[HI]], 8, 23
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
-define void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%val = load i64, i64 addrspace(1)* %in
%and = and i64 %val, 9223372036854775807 ; 0x7fffffffffffffff
%shl = lshr i64 %and, 40
@@ -73,7 +73,7 @@ define void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; GCN: v_lshlrev_b32_e32 v[[HI:[0-9]+]], 3, [[VAL]]
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%val = load i64, i64 addrspace(1)* %in
%shl = shl i64 %val, 35
store i64 %shl, i64 addrspace(1)* %out
@@ -84,7 +84,7 @@ define void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; GCN-DAG: buffer_load_dword v[[HI:[0-9]+]]
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%val = load i64, i64 addrspace(1)* %in
%shl = shl i64 %val, 32
store i64 %shl, i64 addrspace(1)* %out
@@ -96,7 +96,7 @@ define void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; GCN: v_lshlrev_b32_e32 v[[HI:[0-9]+]], 31, [[VAL]]
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @shl_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @shl_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%val = load i64, i64 addrspace(1)* %in
%shl = shl i64 %val, 63
store i64 %shl, i64 addrspace(1)* %out
@@ -106,7 +106,7 @@ define void @shl_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; ashr (i64 x), c with c >= 32 needs only hi(x):
;   lo = (ashr hi(x), c - 32), hi = (ashr hi(x), 31)
; GCN-LABEL: {{^}}ashr_i64_const_32:
-define void @ashr_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%val = load i64, i64 addrspace(1)* %in
%shl = ashr i64 %val, 32
store i64 %shl, i64 addrspace(1)* %out
@@ -114,7 +114,7 @@ define void @ashr_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
}
; GCN-LABEL: {{^}}ashr_i64_const_63:
-define void @ashr_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%val = load i64, i64 addrspace(1)* %in
%shl = ashr i64 %val, 63
store i64 %shl, i64 addrspace(1)* %out
@@ -125,7 +125,7 @@ define void @ashr_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 31, [[VAL]]
; GCN: buffer_store_dword [[SHL]]
-define void @trunc_shl_31_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_31_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
%val = load i64, i64 addrspace(1)* %in
%shl = shl i64 %val, 31
%trunc = trunc i64 %shl to i32
@@ -137,7 +137,7 @@ define void @trunc_shl_31_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in)
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 15, [[VAL]]
; GCN: buffer_store_short [[SHL]]
-define void @trunc_shl_15_i16_i64(i16 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_15_i16_i64(i16 addrspace(1)* %out, i64 addrspace(1)* %in) {
%val = load i64, i64 addrspace(1)* %in
%shl = shl i64 %val, 15
%trunc = trunc i64 %shl to i16
@@ -149,7 +149,7 @@ define void @trunc_shl_15_i16_i64(i16 addrspace(1)* %out, i64 addrspace(1)* %in)
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 15, [[VAL]]
; GCN: buffer_store_short [[SHL]]
-define void @trunc_shl_15_i16_i32(i16 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_15_i16_i32(i16 addrspace(1)* %out, i32 addrspace(1)* %in) {
%val = load i32, i32 addrspace(1)* %in
%shl = shl i32 %val, 15
%trunc = trunc i32 %shl to i16
@@ -161,7 +161,7 @@ define void @trunc_shl_15_i16_i32(i16 addrspace(1)* %out, i32 addrspace(1)* %in)
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 7, [[VAL]]
; GCN: buffer_store_byte [[SHL]]
-define void @trunc_shl_7_i8_i64(i8 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_7_i8_i64(i8 addrspace(1)* %out, i64 addrspace(1)* %in) {
%val = load i64, i64 addrspace(1)* %in
%shl = shl i64 %val, 7
%trunc = trunc i64 %shl to i8
@@ -174,7 +174,7 @@ define void @trunc_shl_7_i8_i64(i8 addrspace(1)* %out, i64 addrspace(1)* %in) {
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 1, [[VAL]]
; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 2, [[SHL]]
; GCN: buffer_store_byte [[AND]]
-define void @trunc_shl_1_i2_i64(i2 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_1_i2_i64(i2 addrspace(1)* %out, i64 addrspace(1)* %in) {
%val = load i64, i64 addrspace(1)* %in
%shl = shl i64 %val, 1
%trunc = trunc i64 %shl to i2
@@ -186,7 +186,7 @@ define void @trunc_shl_1_i2_i64(i2 addrspace(1)* %out, i64 addrspace(1)* %in) {
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 1, [[VAL]]
; GCN: buffer_store_dword [[SHL]]
-define void @trunc_shl_1_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_1_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
%val = load i64, i64 addrspace(1)* %in
%shl = shl i64 %val, 1
%trunc = trunc i64 %shl to i32
@@ -198,7 +198,7 @@ define void @trunc_shl_1_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in)
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[VAL]]
; GCN: buffer_store_dword [[SHL]]
-define void @trunc_shl_16_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_16_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
%val = load i64, i64 addrspace(1)* %in
%shl = shl i64 %val, 16
%trunc = trunc i64 %shl to i32
@@ -209,7 +209,7 @@ define void @trunc_shl_16_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in)
; GCN-LABEL: {{^}}trunc_shl_33_i32_i64:
; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword [[ZERO]]
-define void @trunc_shl_33_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_33_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
%val = load i64, i64 addrspace(1)* %in
%shl = shl i64 %val, 33
%trunc = trunc i64 %shl to i32
@@ -222,7 +222,7 @@ define void @trunc_shl_33_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in)
; GCN-DAG: v_lshlrev_b32_e32 v[[RESHI:[0-9]+]], 16, v{{[0-9]+}}
; GCN-DAG: v_lshlrev_b32_e32 v[[RESLO:[0-9]+]], 16, v[[LO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]{{\]}}
-define void @trunc_shl_16_v2i32_v2i64(<2 x i32> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_16_v2i32_v2i64(<2 x i32> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
%val = load <2 x i64>, <2 x i64> addrspace(1)* %in
%shl = shl <2 x i64> %val, <i64 16, i64 16>
%trunc = trunc <2 x i64> %shl to <2 x i32>
@@ -235,7 +235,7 @@ define void @trunc_shl_16_v2i32_v2i64(<2 x i32> addrspace(1)* %out, <2 x i64> ad
; GCN: v_lshl_b64 v{{\[}}[[RESLO:[0-9]+]]:[[RESHI:[0-9]+]]{{\]}}, [[VAL]], 31
; GCN: buffer_store_dword v[[RESLO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]{{\]}}
-define void @trunc_shl_31_i32_i64_multi_use(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_31_i32_i64_multi_use(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
%val = load i64, i64 addrspace(1)* %in
%shl = shl i64 %val, 31
%trunc = trunc i64 %shl to i32
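The trunc_shl_* tests above pin down two arithmetic facts the backend exploits: for trunc (shl i64 x, c) to i32, when c < 32 every surviving bit comes from lo(x), so the whole computation is one 32-bit shift (v_lshlrev_b32 by c); and when c >= 32 no source bit survives the truncation, so the result folds to the constant 0 (trunc_shl_33_i32_i64 just stores a zeroed register). A sketch of the first case, mirroring trunc_shl_16_i32_i64:

  %val = load i64, i64 addrspace(1)* %in
  %shl = shl i64 %val, 16
  %trunc = trunc i64 %shl to i32       ; == shl i32 lo(%val), 16
  store i32 %trunc, i32 addrspace(1)* %out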
diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll
index 972349c24453..ff666cc3653b 100644
--- a/test/CodeGen/AMDGPU/shl.ll
+++ b/test/CodeGen/AMDGPU/shl.ll
@@ -1,9 +1,11 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; XUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.r600.read.tgid.x() #0
+
;EG: {{^}}shl_v2i32:
;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
@@ -17,7 +19,7 @@ declare i32 @llvm.r600.read.tidig.x() #0
;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
%a = load <2 x i32>, <2 x i32> addrspace(1)* %in
%b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
@@ -44,7 +46,7 @@ define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in
;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32>, <4 x i32> addrspace(1)* %in
%b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
@@ -57,7 +59,7 @@ define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
%b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
%a = load i16, i16 addrspace(1)* %in
%b = load i16, i16 addrspace(1)* %b_ptr
@@ -70,7 +72,7 @@ define void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
; VI: v_lshlrev_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-define void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
+define amdgpu_kernel void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
%a = load i16, i16 addrspace(1)* %in
%result = shl i16 %a, %b
store i16 %result, i16 addrspace(1)* %out
@@ -81,7 +83,7 @@ define void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b)
; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-define void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
+define amdgpu_kernel void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
%a = load i16, i16 addrspace(1)* %in
%b.add = add i16 %b, 3
%result = shl i16 %a, %b.add
@@ -92,7 +94,7 @@ define void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in,
; GCN-LABEL: {{^}}shl_i16_computed_amount:
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 3, v{{[0-9]+}}
; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, [[ADD]], v{{[0-9]+}}
-define void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() #0
%gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
@@ -107,7 +109,7 @@ define void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %
; GCN-LABEL: {{^}}shl_i16_i_s:
; GCN: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 12
-define void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) {
+define amdgpu_kernel void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) {
%result = shl i16 %a, 12
store i16 %result, i16 addrspace(1)* %out
ret void
@@ -116,7 +118,7 @@ define void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) {
; GCN-LABEL: {{^}}shl_v2i16:
; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() #0
%gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -133,7 +135,7 @@ define void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in
; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() #0
%gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid
%gep.out = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i32 %tid
@@ -160,7 +162,7 @@ define void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in
; GCN-LABEL: {{^}}shl_i64:
; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
; VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
-define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
%a = load i64, i64 addrspace(1)* %in
%b = load i64, i64 addrspace(1)* %b_ptr
@@ -199,7 +201,7 @@ define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
-define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
%a = load <2 x i64>, <2 x i64> addrspace(1)* %in
%b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
@@ -262,7 +264,7 @@ define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in
;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
-define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
%a = load <4 x i64>, <4 x i64> addrspace(1)* %in
%b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
@@ -277,7 +279,7 @@ define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[LO_A]]
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) {
%result = shl i64 %a, 32
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -287,8 +289,8 @@ define void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) {
; GCN-DAG: buffer_load_dword v[[LO_A:[0-9]+]],
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[LO_A]]{{\]}}
-define void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tgid.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
%a = load i64, i64 addrspace(1)* %gep.in
@@ -299,7 +301,7 @@ define void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; FUNC-LABEL: {{^}}s_shl_constant_i64
; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-define void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) {
%shl = shl i64 281474976710655, %a
store i64 %shl, i64 addrspace(1)* %out, align 8
ret void
@@ -311,7 +313,7 @@ define void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) {
; SI-DAG: s_movk_i32 s[[KHI:[0-9]+]], 0x11e{{$}}
; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}, [[VAL]]
; SI: buffer_store_dwordx2
-define void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%shl = shl i64 1231231234567, %a
store i64 %shl, i64 addrspace(1)* %out, align 8
@@ -323,7 +325,7 @@ define void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr)
; SI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x12d687{{$}}
; SI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0{{$}}
; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}, [[VAL]]
-define void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%shl = shl i64 1234567, %a
store i64 %shl, i64 addrspace(1)* %out, align 8
@@ -332,7 +334,7 @@ define void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)*
; FUNC-LABEL: {{^}}v_shl_inline_imm_64_i64:
; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, 64, {{v[0-9]+}}
-define void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%shl = shl i64 64, %a
store i64 %shl, i64 addrspace(1)* %out, align 8
@@ -341,7 +343,7 @@ define void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
; FUNC-LABEL: {{^}}s_shl_inline_imm_64_i64:
; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 64, s{{[0-9]+}}
-define void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%shl = shl i64 64, %a
store i64 %shl, i64 addrspace(1)* %out, align 8
ret void
@@ -349,7 +351,7 @@ define void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
; FUNC-LABEL: {{^}}s_shl_inline_imm_1_i64:
; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 1, s{{[0-9]+}}
-define void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%shl = shl i64 1, %a
store i64 %shl, i64 addrspace(1)* %out, align 8
ret void
@@ -357,7 +359,7 @@ define void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a
; FUNC-LABEL: {{^}}s_shl_inline_imm_1.0_i64:
; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, s{{[0-9]+}}
-define void @s_shl_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%shl = shl i64 4607182418800017408, %a
store i64 %shl, i64 addrspace(1)* %out, align 8
ret void
@@ -365,7 +367,7 @@ define void @s_shl_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_1.0_i64:
; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, s{{[0-9]+}}
-define void @s_shl_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%shl = shl i64 13830554455654793216, %a
store i64 %shl, i64 addrspace(1)* %out, align 8
ret void
@@ -373,7 +375,7 @@ define void @s_shl_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(
; FUNC-LABEL: {{^}}s_shl_inline_imm_0.5_i64:
; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 0.5, s{{[0-9]+}}
-define void @s_shl_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%shl = shl i64 4602678819172646912, %a
store i64 %shl, i64 addrspace(1)* %out, align 8
ret void
@@ -381,7 +383,7 @@ define void @s_shl_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_0.5_i64:
; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -0.5, s{{[0-9]+}}
-define void @s_shl_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%shl = shl i64 13826050856027422720, %a
store i64 %shl, i64 addrspace(1)* %out, align 8
ret void
@@ -389,7 +391,7 @@ define void @s_shl_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(
; FUNC-LABEL: {{^}}s_shl_inline_imm_2.0_i64:
; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 2.0, s{{[0-9]+}}
-define void @s_shl_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%shl = shl i64 4611686018427387904, %a
store i64 %shl, i64 addrspace(1)* %out, align 8
ret void
@@ -397,7 +399,7 @@ define void @s_shl_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_2.0_i64:
; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -2.0, s{{[0-9]+}}
-define void @s_shl_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%shl = shl i64 13835058055282163712, %a
store i64 %shl, i64 addrspace(1)* %out, align 8
ret void
@@ -405,7 +407,7 @@ define void @s_shl_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(
; FUNC-LABEL: {{^}}s_shl_inline_imm_4.0_i64:
; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 4.0, s{{[0-9]+}}
-define void @s_shl_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%shl = shl i64 4616189618054758400, %a
store i64 %shl, i64 addrspace(1)* %out, align 8
ret void
@@ -413,7 +415,7 @@ define void @s_shl_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_4.0_i64:
; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -4.0, s{{[0-9]+}}
-define void @s_shl_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%shl = shl i64 13839561654909534208, %a
store i64 %shl, i64 addrspace(1)* %out, align 8
ret void
@@ -427,7 +429,7 @@ define void @s_shl_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(
; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 4.0
; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0{{$}}
; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}}
-define void @s_shl_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%shl = shl i64 1082130432, %a
store i64 %shl, i64 addrspace(1)* %out, align 8
ret void
@@ -439,7 +441,7 @@ define void @s_shl_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(
; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -1{{$}}
; SI-DAG: s_mov_b32 s[[K_HI_COPY:[0-9]+]], s[[K_HI]]
; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI_COPY]]{{\]}}, s{{[0-9]+}}
-define void @s_shl_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%shl = shl i64 -1065353216, %a
store i64 %shl, i64 addrspace(1)* %out, align 8
ret void
@@ -450,7 +452,7 @@ define void @s_shl_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrsp
; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 4.0
; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}}
-define void @s_shl_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%shl = shl i64 4647714815446351872, %a
store i64 %shl, i64 addrspace(1)* %out, align 8
ret void
@@ -460,10 +462,18 @@ define void @s_shl_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrs
; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -4.0
; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}}
-define void @s_shl_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%shl = shl i64 13871086852301127680, %a
store i64 %shl, i64 addrspace(1)* %out, align 8
ret void
}
+; FUNC-LABEL: {{^}}test_mul2:
+; GCN: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1
+define amdgpu_kernel void @test_mul2(i32 %p) {
+ %i = mul i32 %p, 2
+ store volatile i32 %i, i32 addrspace(1)* undef
+ ret void
+}
+
attributes #0 = { nounwind readnone }
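; --- illustrative aside, not part of the patch ---
; A minimal sketch of the calling-convention migration these hunks apply:
; kernels previously written with the default C convention now carry an
; explicit amdgpu_kernel convention. The kernel below is hypothetical,
; not taken from the patch:
;
;   before: define void @k(i32 addrspace(1)* %out) { ... }
;   after:
define amdgpu_kernel void @k(i32 addrspace(1)* %out) {
  store i32 0, i32 addrspace(1)* %out
  ret void
}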
diff --git a/test/CodeGen/AMDGPU/shl.v2i16.ll b/test/CodeGen/AMDGPU/shl.v2i16.ll
new file mode 100644
index 000000000000..eac29bad7cf2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -0,0 +1,152 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
+
+; GCN-LABEL: {{^}}s_shl_v2i16:
+; GFX9: s_load_dword [[LHS:s[0-9]+]]
+; GFX9: s_load_dword [[RHS:s[0-9]+]]
+; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
+; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
+
+; VI: v_lshlrev_b32_e32
+; VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
+; CI: v_lshlrev_b32_e32
+; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_or_b32_e32
+define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
+ %result = shl <2 x i16> %lhs, %rhs
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_shl_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+
+; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_lshlrev_b16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}}
+; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[LHS]]
+; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1
+ %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
+ %result = shl <2 x i16> %a, %b
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}shl_v_s_v2i16:
+; GFX9: s_load_dword [[RHS:s[0-9]+]]
+; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %result = shl <2 x i16> %vgpr, %sgpr
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}shl_s_v_v2i16:
+; GFX9: s_load_dword [[LHS:s[0-9]+]]
+; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %result = shl <2 x i16> %sgpr, %vgpr
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}shl_imm_v_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], 8
+define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %result = shl <2 x i16> <i16 8, i16 8>, %vgpr
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}shl_v_imm_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], 8, [[LHS]]
+define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %result = shl <2 x i16> %vgpr, <i16 8, i16 8>
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_shl_v4i16:
+; GCN: {{buffer|flat}}_load_dwordx2
+; GCN: {{buffer|flat}}_load_dwordx2
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: {{buffer|flat}}_store_dwordx2
+define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
+ %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1
+ %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+ %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
+ %result = shl <4 x i16> %a, %b
+ store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}shl_v_imm_v4i16:
+; GCN: {{buffer|flat}}_load_dwordx2
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GCN: {{buffer|flat}}_store_dwordx2
+define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+ %result = shl <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
+ store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
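; --- illustrative aside, not part of the patch ---
; What the prefixes above encode for one <2 x i16> shift, sketched in
; assembly with hypothetical registers: gfx9 keeps both halves in a single
; packed op,
;   v_pk_lshlrev_b16 v0, v1, v0
; (note the shift amount comes first), while VI shifts the low half with a
; plain 16-bit op, does the high half via SDWA word selects, and re-packs:
;   v_lshlrev_b16_e32  v2, v1, v0
;   v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
;   v_or_b32_e32       v0, v2, v0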
diff --git a/test/CodeGen/AMDGPU/shl_add_constant.ll b/test/CodeGen/AMDGPU/shl_add_constant.ll
index 9b5f9fed4d79..9da4bc028016 100644
--- a/test/CodeGen/AMDGPU/shl_add_constant.ll
+++ b/test/CodeGen/AMDGPU/shl_add_constant.ll
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 36, [[REG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
%val = load i32, i32 addrspace(1)* %ptr, align 4
@@ -25,7 +25,7 @@ define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-DAG: buffer_store_dword [[ADDREG]]
; SI-DAG: buffer_store_dword [[SHLREG]]
; SI: s_endpgm
-define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
%val = load i32, i32 addrspace(1)* %ptr, align 4
@@ -43,7 +43,7 @@ define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1
; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xf9c, [[REG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
%val = load i32, i32 addrspace(1)* %ptr, align 4
@@ -61,7 +61,7 @@ define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0
; SI: s_addk_i32 [[RESULT]], 0x3d8
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
; SI: buffer_store_dword [[VRESULT]]
-define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
%add.0 = add i32 %x, 123
%shl = shl i32 %add.0, 3
%add.1 = add i32 %shl, %y
@@ -78,7 +78,7 @@ define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[TMP]]
; SI: buffer_store_dword [[VRESULT]]
-define void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
%add.0 = add i32 %x, 123
%shl = shl i32 %add.0, 3
%add.1 = add i32 %y, %shl
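; --- illustrative aside, not part of the patch ---
; The magic constants in the checks above are folded through
; (x + c) << n = (x << n) + (c << n):
;   shl_2_add_9_i32:             9 << 2 = 36
;   shl_2_add_999_i32:         999 << 2 = 3996 = 0xf9c
;   test_add_shl_add_constant: 123 << 3 = 984  = 0x3d8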
diff --git a/test/CodeGen/AMDGPU/shl_add_ptr.ll b/test/CodeGen/AMDGPU/shl_add_ptr.ll
index 6e45759fa058..9147eb58c6ad 100644
--- a/test/CodeGen/AMDGPU/shl_add_ptr.ll
+++ b/test/CodeGen/AMDGPU/shl_add_ptr.ll
@@ -19,7 +19,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8
; SI: s_endpgm
-define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
@@ -39,7 +39,7 @@ define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %ad
; SI-DAG: buffer_store_dword [[RESULT]]
; SI-DAG: buffer_store_dword [[ADDUSE]]
; SI: s_endpgm
-define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
@@ -55,7 +55,7 @@ define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %ad
; SI-LABEL: {{^}}load_shl_base_lds_max_offset
; SI: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535
; SI: s_endpgm
-define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 65535
%arrayidx0 = getelementptr inbounds [65536 x i8], [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0
@@ -73,7 +73,7 @@ define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)
; SI: s_mov_b32 m0, -1
; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
; SI: s_endpgm
-define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 64
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
@@ -89,7 +89,7 @@ define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
-define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
@@ -104,7 +104,7 @@ define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %a
@lds2 = addrspace(3) global [512 x i32] undef, align 4
-; define void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+; define amdgpu_kernel void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
; %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
; %idx.0 = add nsw i32 %tid.x, 2
; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -119,7 +119,7 @@ define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %a
; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; SI: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8
; SI: s_endpgm
-define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 {
+define amdgpu_kernel void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -134,7 +134,7 @@ define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace
; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; SI: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
-define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -148,7 +148,7 @@ define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)
; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; SI: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
-define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -162,7 +162,7 @@ define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; SI: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
-define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -176,7 +176,7 @@ define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; SI: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
-define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -190,7 +190,7 @@ define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; SI: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
-define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -204,7 +204,7 @@ define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; SI: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
-define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -214,7 +214,7 @@ define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
ret void
}
-; define void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+; define amdgpu_kernel void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
; %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
; %idx.0 = add nsw i32 %tid.x, 2
; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -228,7 +228,7 @@ define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; SI: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
-define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -242,7 +242,7 @@ define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; SI: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
-define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -256,7 +256,7 @@ define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; SI: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
-define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -270,7 +270,7 @@ define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)
; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; SI: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
-define void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
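; --- illustrative aside, not part of the patch ---
; The offset:8 in the DS checks above is the folded constant part of the
; address: %idx.0 = %tid.x + 2 indexes 4-byte elements, so the +2 becomes
; the immediate byte offset 2 * 4 = 8 on the ds instruction, and only
; tid.x << 2 remains in the address VGPR. The 65535 case sits exactly at
; the limit of the 16-bit offset field.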
diff --git a/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
new file mode 100644
index 000000000000..14ca635c6dad
--- /dev/null
+++ b/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -0,0 +1,186 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; Test that add/sub with a constant is swapped to sub/add with negated
+; constant to minimize code size.
+
+; GCN-LABEL: {{^}}v_test_i32_x_sub_64:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
+define amdgpu_kernel void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+ %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %x = load i32, i32 addrspace(1)* %gep
+ %result = sub i32 %x, 64
+ store i32 %result, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_x_sub_64_multi_use:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[Y:v[0-9]+]]
+; GCN-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
+; GCN-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[Y]]
+define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+ %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %x = load volatile i32, i32 addrspace(1)* %gep
+ %y = load volatile i32, i32 addrspace(1)* %gep
+ %result0 = sub i32 %x, 64
+ %result1 = sub i32 %y, 64
+ store volatile i32 %result0, i32 addrspace(1)* %gep.out
+ store volatile i32 %result1, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_64_sub_x:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
+define amdgpu_kernel void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+ %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %x = load i32, i32 addrspace(1)* %gep
+ %result = sub i32 64, %x
+ store i32 %result, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_x_sub_65:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xffffffbf, [[X]]
+define amdgpu_kernel void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+ %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %x = load i32, i32 addrspace(1)* %gep
+ %result = sub i32 %x, 65
+ store i32 %result, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_65_sub_x:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0x41, [[X]]
+define amdgpu_kernel void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+ %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %x = load i32, i32 addrspace(1)* %gep
+ %result = sub i32 65, %x
+ store i32 %result, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_x_sub_neg16:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 16, [[X]]
+define amdgpu_kernel void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+ %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %x = load i32, i32 addrspace(1)* %gep
+ %result = sub i32 %x, -16
+ store i32 %result, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_neg16_sub_x:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, -16, [[X]]
+define amdgpu_kernel void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+ %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %x = load i32, i32 addrspace(1)* %gep
+ %result = sub i32 -16, %x
+ store i32 %result, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_x_sub_neg17:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 17, [[X]]
+define amdgpu_kernel void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+ %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %x = load i32, i32 addrspace(1)* %gep
+ %result = sub i32 %x, -17
+ store i32 %result, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_neg17_sub_x:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0xffffffef, [[X]]
+define amdgpu_kernel void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+ %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %x = load i32, i32 addrspace(1)* %gep
+ %result = sub i32 -17, %x
+ store i32 %result, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_i32_x_sub_64:
+; GCN: s_load_dword [[X:s[0-9]+]]
+; GCN: s_sub_i32 s{{[0-9]+}}, [[X]], 64
+define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 {
+ %result = sub i32 %x, 64
+ call void asm sideeffect "; use $0", "s"(i32 %result)
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i16_x_sub_64:
+; VI: {{buffer|flat}}_load_ushort [[X:v[0-9]+]]
+; VI: v_subrev_u16_e32 v{{[0-9]+}}, 64, [[X]]
+define amdgpu_kernel void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
+ %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
+ %x = load i16, i16 addrspace(1)* %gep
+ %result = sub i16 %x, 64
+ store i16 %result, i16 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i16_x_sub_64_multi_use:
+; GCN: {{buffer|flat}}_load_ushort [[X:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort [[Y:v[0-9]+]]
+; VI-DAG: v_subrev_u16_e32 v{{[0-9]+}}, 64, [[X]]
+; VI-DAG: v_subrev_u16_e32 v{{[0-9]+}}, 64, [[Y]]
+
+; SI-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
+; SI-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[Y]]
+define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
+ %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
+ %x = load volatile i16, i16 addrspace(1)* %gep
+ %y = load volatile i16, i16 addrspace(1)* %gep
+ %result0 = sub i16 %x, 64
+ %result1 = sub i16 %y, 64
+ store volatile i16 %result0, i16 addrspace(1)* %gep.out
+ store volatile i16 %result1, i16 addrspace(1)* %gep.out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
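; --- illustrative aside, not part of the patch ---
; The pattern in the checks above, assuming the usual GCN operand rules
; (integer inline constants span -16..64; anything else costs an extra
; 32-bit literal dword): whichever of c or -c is an inline constant picks
; the instruction form, e.g.
;   x - 64    -> v_subrev_i32_e32 v0, vcc, 64, v0          ; 64 is inline
;   x - (-16) -> v_add_i32_e32    v0, vcc, 16, v0          ; 16 is inline
;   x - 65    -> v_add_i32_e32    v0, vcc, 0xffffffbf, v0  ; -65 as literal,
;                                        since 0xffffffbf = 2^32 - 65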
diff --git a/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
index 1988a14b5845..6248d8a46daf 100644
--- a/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
+++ b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
@@ -7,7 +7,7 @@
# resume crashes
--- |
- define void @shrink_add_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ define amdgpu_kernel void @shrink_add_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -20,7 +20,7 @@
ret void
}
- define void @shrink_sub_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ define amdgpu_kernel void @shrink_sub_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -33,7 +33,7 @@
ret void
}
- define void @shrink_subrev_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ define amdgpu_kernel void @shrink_subrev_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -46,7 +46,7 @@
ret void
}
- define void @check_addc_src2_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ define amdgpu_kernel void @check_addc_src2_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -59,7 +59,7 @@
ret void
}
- define void @shrink_addc_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ define amdgpu_kernel void @shrink_addc_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -72,7 +72,7 @@
ret void
}
- define void @shrink_addc_undef_vcc(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ define amdgpu_kernel void @shrink_addc_undef_vcc(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
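# --- illustrative aside, not part of the patch ---
# The shrink these MIR tests exercise, sketched in assembly with
# hypothetical registers: the VOP3 (64-bit) encoding can name any SGPR
# pair for the carry-out,
#   v_add_i32_e64 v0, s[0:1], v1, v2
# but the VOP2 (32-bit) encoding implicitly writes vcc,
#   v_add_i32_e32 v0, vcc, v1, v2
# so shrinking is only legal when the carry-out can be rewritten to vcc
# (and, for v_addc, when the implicit vcc input is available or undef).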
diff --git a/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll b/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
index ef616eb63801..5c6663dbbdab 100644
--- a/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
+++ b/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
@@ -6,10 +6,10 @@
; OPT-NOT: call i1 @llvm.amdgcn.loop
; GCN-LABEL: {{^}}annotate_unreachable_noloop:
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
; GCN-NOT: s_endpgm
; GCN: .Lfunc_end0
-define void @annotate_unreachable_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
+define amdgpu_kernel void @annotate_unreachable_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -37,12 +37,49 @@ bb5: ; preds = %bb3, %bb1
; OPT-NOT: call i1 @llvm.amdgcn.loop
; GCN-LABEL: {{^}}annotate_ret_noloop:
+; GCN: load_dwordx4
+; GCN: v_cmp_nlt_f32
+; GCN: s_and_saveexec_b64
+; GCN: ; mask branch [[UNIFIED_RET:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: [[UNIFIED_RET]]:
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: s_endpgm
+; GCN: .Lfunc_end
+define amdgpu_kernel void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ br label %bb1
+
+bb1: ; preds = %bb
+ %tmp2 = sext i32 %tmp to i64
+ %tmp3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i64 %tmp2
+ %tmp4 = load <4 x float>, <4 x float> addrspace(1)* %tmp3, align 16
+ %tmp5 = extractelement <4 x float> %tmp4, i32 1
+ store volatile <4 x float> %tmp4, <4 x float> addrspace(1)* undef
+ %cmp = fcmp ogt float %tmp5, 1.0
+ br i1 %cmp, label %bb5, label %bb3
+
+bb3: ; preds = %bb1
+ %tmp6 = extractelement <4 x float> %tmp4, i32 2
+ %tmp7 = fcmp olt float %tmp6, 0.000000e+00
+ br i1 %tmp7, label %bb4, label %bb5 ; crash goes away if these are swapped
+
+bb4: ; preds = %bb3
+ ret void
+
+bb5: ; preds = %bb3, %bb1
+ ret void
+}
+
+; OPT-LABEL: @uniform_annotate_ret_noloop(
+; OPT-NOT: call i1 @llvm.amdgcn.loop
+
+; GCN-LABEL: {{^}}uniform_annotate_ret_noloop:
; GCN: s_cbranch_scc1
; GCN: s_endpgm
-; GCN: .Lfunc_end1
-define void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
+; GCN: .Lfunc_end
+define amdgpu_kernel void @uniform_annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg, i32 %tmp) #0 {
bb:
- %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
bb1: ; preds = %bb
diff --git a/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll b/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll
new file mode 100644
index 000000000000..e50c595bc6c3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll
@@ -0,0 +1,40 @@
+; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=OPT %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+
+; OPT-LABEL: @annotate_unreachable(
+; OPT: call { i1, i64 } @llvm.amdgcn.if(
+; OPT-NOT: call void @llvm.amdgcn.end.cf(
+
+
+; GCN-LABEL: {{^}}annotate_unreachable:
+; GCN: s_and_saveexec_b64
+; GCN-NOT: s_endpgm
+; GCN: .Lfunc_end0
+define amdgpu_kernel void @annotate_unreachable(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ br label %bb1
+
+bb1: ; preds = %bb
+ %tmp2 = sext i32 %tmp to i64
+ %tmp3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i64 %tmp2
+ %tmp4 = load <4 x float>, <4 x float> addrspace(1)* %tmp3, align 16
+ br i1 undef, label %bb3, label %bb5 ; label order reversed
+
+bb3: ; preds = %bb1
+ %tmp6 = extractelement <4 x float> %tmp4, i32 2
+ %tmp7 = fcmp olt float %tmp6, 0.000000e+00
+ br i1 %tmp7, label %bb4, label %bb5
+
+bb4: ; preds = %bb3
+ unreachable
+
+bb5: ; preds = %bb3, %bb1
+ unreachable
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
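; --- illustrative aside, not part of the patch ---
; Roughly the shape the OPT checks above look for after structurization
; and annotation (a sketch; value names are hypothetical):
;   %pair = call { i1, i64 } @llvm.amdgcn.if(i1 %cmp)
;   %take = extractvalue { i1, i64 } %pair, 0
;   %mask = extractvalue { i1, i64 } %pair, 1
;   br i1 %take, label %then, label %merge
; and, because every path here ends in unreachable, no matching
;   call void @llvm.amdgcn.end.cf(i64 %mask)
; is inserted to restore exec.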
diff --git a/test/CodeGen/AMDGPU/si-annotate-cf.ll b/test/CodeGen/AMDGPU/si-annotate-cf.ll
index d658b229fd37..a4b6d1fd069d 100644
--- a/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -10,7 +10,7 @@
; SI: s_andn2_b64
; s_cbranch_execnz [[LOOP_LABEL]]
; SI: s_endpgm
-define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) {
main_body:
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%0 = and i32 %a, %tid
@@ -40,7 +40,7 @@ ENDIF:
; SI: s_cbranch_execnz [[LOOP_LABEL]]
; SI: s_endpgm
-define void @phi_cond_outside_loop(i32 %b) {
+define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) {
entry:
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%0 = icmp eq i32 %tid , 0
@@ -68,7 +68,7 @@ exit:
; CHECK-LABEL: {{^}}switch_unreachable:
; CHECK-NOT: s_endpgm
; CHECK: .Lfunc_end2
-define void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
+define amdgpu_kernel void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
centry:
switch i32 %x, label %sw.default [
i32 0, label %sw.bb
@@ -100,7 +100,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone
; SI: [[ENDPGM]]:
; SI: s_endpgm
-define void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind {
+define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind {
entry:
%cmp = icmp sgt i32 %c0, 0
br label %while.cond.outer
diff --git a/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll b/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
index 025a3d8fca2e..b0473f3b5bda 100644
--- a/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
+++ b/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
@@ -6,7 +6,7 @@
; CHECK s_or_b64 exec, exec
; CHECK s_andn2_b64 exec, exec
; CHECK s_cbranch_execnz
-define void @test(i32 %arg, i32 %arg1) {
+define amdgpu_kernel void @test(i32 %arg, i32 %arg1) {
bb:
%tmp = icmp ne i32 %arg, 0
%tmp7 = icmp ne i32 %arg1, 0
diff --git a/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir b/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir
index 0c08deb13a8e..20052e865a54 100644
--- a/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir
+++ b/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir
@@ -1,7 +1,7 @@
# RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies %s -o - | FileCheck %s -check-prefixes=GCN
--- |
- define void @phi_visit_order() { ret void }
+ define amdgpu_kernel void @phi_visit_order() { ret void }
name: phi_visit_order
tracksRegLiveness: true
diff --git a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
index 0d1de6662f25..580268deb85d 100644
--- a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
+++ b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
@@ -5,7 +5,7 @@
; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 %{{[0-9]+}}, %{{[0-9]+}}, implicit-def %vcc, implicit %exec
-define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load volatile i32, i32 addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/si-literal-folding.ll b/test/CodeGen/AMDGPU/si-literal-folding.ll
deleted file mode 100644
index b3f000c8ccd2..000000000000
--- a/test/CodeGen/AMDGPU/si-literal-folding.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-
-; GCN-LABEL: {{^}}main:
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f4353f8, v{{[0-9]+}}
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0xbf4353f8, v{{[0-9]+}}
-define amdgpu_vs void @main(float) {
-main_body:
- %1 = fmul float %0, 0x3FE86A7F00000000
- %2 = fmul float %0, 0xBFE86A7F00000000
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %1, float %1, float %2, float %2)
- ret void
-}
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/si-lod-bias.ll b/test/CodeGen/AMDGPU/si-lod-bias.ll
index 8e846d7a238e..3a7359ea4ffa 100644
--- a/test/CodeGen/AMDGPU/si-lod-bias.ll
+++ b/test/CodeGen/AMDGPU/si-lod-bias.ll
@@ -1,12 +1,12 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; This shader has the potential to generate illegal VGPR to SGPR copies if
; the wrong register class is used for the REG_SEQUENCE instructions.
-; CHECK: {{^}}main:
-; CHECK: image_sample_b v{{\[[0-9]:[0-9]\]}}, v{{\[[0-9]:[0-9]\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf
-define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) {
+; GCN-LABEL: {{^}}main:
+; GCN: image_sample_b v{{\[[0-9]:[0-9]\]}}, v{{\[[0-9]:[0-9]\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf
+define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
main_body:
%tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
%tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
@@ -15,38 +15,45 @@ main_body:
%tmp23 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp22, !tbaa !0
%tmp24 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg1, i32 0
%tmp25 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp24, !tbaa !0
- %tmp26 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg3, <2 x i32> %arg5)
- %tmp27 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg3, <2 x i32> %arg5)
+ %i.i = extractelement <2 x i32> %arg5, i32 0
+ %j.i = extractelement <2 x i32> %arg5, i32 1
+ %i.f.i = bitcast i32 %i.i to float
+ %j.f.i = bitcast i32 %j.i to float
+ %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg3) #0
+ %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg3) #0
+ %i.i1 = extractelement <2 x i32> %arg5, i32 0
+ %j.i2 = extractelement <2 x i32> %arg5, i32 1
+ %i.f.i3 = bitcast i32 %i.i1 to float
+ %j.f.i4 = bitcast i32 %j.i2 to float
+ %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 1, i32 0, i32 %arg3) #0
+ %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 1, i32 0, i32 %arg3) #0
%tmp28 = bitcast float %tmp21 to i32
- %tmp29 = bitcast float %tmp26 to i32
- %tmp30 = bitcast float %tmp27 to i32
+ %tmp29 = bitcast float %p2.i to i32
+ %tmp30 = bitcast float %p2.i6 to i32
%tmp31 = insertelement <4 x i32> undef, i32 %tmp28, i32 0
%tmp32 = insertelement <4 x i32> %tmp31, i32 %tmp29, i32 1
%tmp33 = insertelement <4 x i32> %tmp32, i32 %tmp30, i32 2
%tmp34 = insertelement <4 x i32> %tmp33, i32 undef, i32 3
%tmp25.bc = bitcast <16 x i8> %tmp25 to <4 x i32>
- %tmp35 = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> %tmp34, <8 x i32> %tmp23, <4 x i32> %tmp25.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp34.bc = bitcast <4 x i32> %tmp34 to <4 x float>
+ %tmp35 = call <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float> %tmp34.bc, <8 x i32> %tmp23, <4 x i32> %tmp25.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp36 = extractelement <4 x float> %tmp35, i32 0
%tmp37 = extractelement <4 x float> %tmp35, i32 1
%tmp38 = extractelement <4 x float> %tmp35, i32 2
%tmp39 = extractelement <4 x float> %tmp35, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %tmp36, float %tmp37, float %tmp38, float %tmp39)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp36, float %tmp37, float %tmp38, float %tmp39, i1 true, i1 true) #0
ret void
}
-; Function Attrs: nounwind readnone
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
-
-declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
!0 = !{!1, !1, i64 0, i32 1}
!1 = !{!"const", !2}
diff --git a/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll b/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
index 8d66df258e43..cb010cf15300 100644
--- a/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
+++ b/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
@@ -4,17 +4,18 @@
; GCN: v_cmp_eq_u32
; GCN: s_and_saveexec_b64
; GCN: s_xor_b64
-; GCN: ; mask branch [[RET:BB[0-9]+]]
-; GCN: s_branch [[UNREACHABLE:BB[0-9]+_[0-9]+]]
-; GCN: [[RET]]
-; GCN: s_or_b64 exec, exec
-; GCN: s_endpgm
-
-; GCN: [[UNREACHABLE]]:
+; GCN: ; mask branch [[RET:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %unreachable
; GCN: ds_write_b32
+; GCN: ; divergent unreachable
; GCN: s_waitcnt
-define void @lower_control_flow_unreachable_terminator() #0 {
+
+; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN: s_endpgm
+
+define amdgpu_kernel void @lower_control_flow_unreachable_terminator() #0 {
bb:
%tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y()
%tmp63 = icmp eq i32 %tmp15, 32
@@ -29,19 +30,20 @@ ret:
}
; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order:
-; GCN: v_cmp_eq_u32
+; GCN: v_cmp_ne_u32
; GCN: s_and_saveexec_b64
; GCN: s_xor_b64
-; GCN: ; mask branch [[UNREACHABLE:BB[0-9]+_[0-9]+]]
-; GCN-NEXT: ; %ret
-; GCN-NEXT: s_endpgm
-
-; GCN-NEXT: [[UNREACHABLE]]:
-; GCN-NEXT: s_or_b64 exec, exec
+; GCN: ; mask branch [[RETURN:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %unreachable
; GCN: ds_write_b32
+; GCN: ; divergent unreachable
; GCN: s_waitcnt
-define void @lower_control_flow_unreachable_terminator_swap_block_order() #0 {
+
+; GCN: [[RETURN]]:
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @lower_control_flow_unreachable_terminator_swap_block_order() #0 {
bb:
%tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y()
%tmp63 = icmp eq i32 %tmp15, 32
@@ -55,7 +57,29 @@ unreachable:
unreachable
}
-; Function Attrs: nounwind readnone
+; GCN-LABEL: {{^}}uniform_lower_control_flow_unreachable_terminator:
+; GCN: s_cmp_lg_u32
+; GCN: s_cbranch_scc0 [[UNREACHABLE:BB[0-9]+_[0-9]+]]
+
+; GCN-NEXT: BB#{{[0-9]+}}: ; %ret
+; GCN-NEXT: s_endpgm
+
+; GCN: [[UNREACHABLE]]:
+; GCN: ds_write_b32
+; GCN: s_waitcnt
+define amdgpu_kernel void @uniform_lower_control_flow_unreachable_terminator(i32 %arg0) #0 {
+bb:
+ %tmp63 = icmp eq i32 %arg0, 32
+ br i1 %tmp63, label %unreachable, label %ret
+
+unreachable:
+ store volatile i32 0, i32 addrspace(3)* undef, align 4
+ unreachable
+
+ret:
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.y() #1
attributes #0 = { nounwind }
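The three tests in this file differ in whether the branch condition is divergent: a value derived from llvm.amdgcn.workitem.id.y varies per lane and is lowered through EXEC-mask manipulation (v_cmp, s_and_saveexec_b64, s_xor_b64, mask branch), while a value computed from a scalar kernel argument is uniform and lowers to an ordinary scalar branch (s_cmp_lg_u32, s_cbranch_scc0). A minimal sketch of the two condition shapes, under those assumptions (names are placeholders):

  declare i32 @llvm.amdgcn.workitem.id.y()

  define amdgpu_kernel void @divergence_sketch(i32 %scalar) {
  entry:
    %tid = call i32 @llvm.amdgcn.workitem.id.y()
    %cc.div = icmp eq i32 %tid, 32     ; per-lane value -> v_cmp + EXEC-mask lowering
    br i1 %cc.div, label %mid, label %done
  mid:
    %cc.uni = icmp eq i32 %scalar, 32  ; uniform SGPR value -> s_cmp + s_cbranch_scc
    br i1 %cc.uni, label %done, label %other
  other:
    ret void
  done:
    ret void
  }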
diff --git a/test/CodeGen/AMDGPU/si-scheduler.ll b/test/CodeGen/AMDGPU/si-scheduler.ll
index 9374ef3cd907..462528c4ff1a 100644
--- a/test/CodeGen/AMDGPU/si-scheduler.ll
+++ b/test/CodeGen/AMDGPU/si-scheduler.ll
@@ -3,7 +3,7 @@
; The only way the subtarget knows that the si machine scheduler is being used
; is to specify -mattr=si-scheduler. If we just pass --misched=si, the backend
; won't know what scheduler we are using.
-; RUN: llc -march=amdgcn -mcpu=SI --misched=si -mattr=si-scheduler < %s | FileCheck %s
+; RUN: llc -march=amdgcn --misched=si -mattr=si-scheduler < %s | FileCheck %s
; The test checks the "si" machine scheduler pass works correctly.
@@ -22,39 +22,46 @@ main_body:
%tmp22 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp, align 32, !tbaa !0
%tmp23 = bitcast [17 x <4 x i32>] addrspace(2)* %arg2 to <16 x i8> addrspace(2)*
%tmp24 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp23, align 16, !tbaa !0
- %tmp25 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg5, <2 x i32> %arg11)
- %tmp26 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg5, <2 x i32> %arg11)
- %tmp27 = bitcast float %tmp25 to i32
- %tmp28 = bitcast float %tmp26 to i32
+ %i.i = extractelement <2 x i32> %arg11, i32 0
+ %j.i = extractelement <2 x i32> %arg11, i32 1
+ %i.f.i = bitcast i32 %i.i to float
+ %j.f.i = bitcast i32 %j.i to float
+ %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg5) #1
+ %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg5) #1
+ %i.i1 = extractelement <2 x i32> %arg11, i32 0
+ %j.i2 = extractelement <2 x i32> %arg11, i32 1
+ %i.f.i3 = bitcast i32 %i.i1 to float
+ %j.f.i4 = bitcast i32 %j.i2 to float
+ %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 1, i32 0, i32 %arg5) #1
+ %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 1, i32 0, i32 %arg5) #1
+ %tmp27 = bitcast float %p2.i to i32
+ %tmp28 = bitcast float %p2.i6 to i32
%tmp29 = insertelement <2 x i32> undef, i32 %tmp27, i32 0
%tmp30 = insertelement <2 x i32> %tmp29, i32 %tmp28, i32 1
%tmp22.bc = bitcast <32 x i8> %tmp22 to <8 x i32>
%tmp24.bc = bitcast <16 x i8> %tmp24 to <4 x i32>
- %tmp31 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp30, <8 x i32> %tmp22.bc, <4 x i32> %tmp24.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp30.bc = bitcast <2 x i32> %tmp30 to <2 x float>
+ %tmp31 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp30.bc, <8 x i32> %tmp22.bc, <4 x i32> %tmp24.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+
%tmp32 = extractelement <4 x float> %tmp31, i32 0
%tmp33 = extractelement <4 x float> %tmp31, i32 1
%tmp34 = extractelement <4 x float> %tmp31, i32 2
%tmp35 = extractelement <4 x float> %tmp31, i32 3
- %tmp36 = call i32 @llvm.SI.packf16(float %tmp32, float %tmp33)
- %tmp37 = bitcast i32 %tmp36 to float
- %tmp38 = call i32 @llvm.SI.packf16(float %tmp34, float %tmp35)
- %tmp39 = bitcast i32 %tmp38 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp37, float %tmp39, float %tmp37, float %tmp39)
+ %tmp36 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp32, float %tmp33)
+ %tmp38 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp34, float %tmp35)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp36, <2 x half> %tmp38, i1 true, i1 false) #0
ret void
}
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
-
-declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
!0 = !{!1, !1, i64 0, i32 1}
!1 = !{!"const", !2}
diff --git a/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/test/CodeGen/AMDGPU/si-sgpr-spill.ll
index e61b4051124a..8731e74d63a0 100644
--- a/test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -1,27 +1,29 @@
-; RUN: llc -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=TOVGPR %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; These tests check that the compiler won't crash when it needs to spill
; SGPRs.
-
@ddxy_lds = external addrspace(3) global [64 x i32]
-; CHECK-LABEL: {{^}}main:
-; CHECK: s_wqm
+; GCN-LABEL: {{^}}main:
+; GCN: s_wqm
; Make sure we aren't emitting unused scratch resource descriptor setup
-; CHECK-NOT: s_mov_b32
-; CHECK-NOT: s_mov_b32
-; CHECK-NOT: s_mov_b32
-; CHECK-NOT: s_mov_b32
-; CHECK: s_mov_b32 m0
+; GCN-NOT: s_mov_b32
+; GCN-NOT: s_mov_b32
+; GCN-NOT: s_mov_b32
+; GCN-NOT: s_mov_b32
+; GCN: s_mov_b32 m0
+; Make sure scratch space isn't being used for SGPR->VGPR spills
; Writing to M0 from an SMRD instruction will hang the GPU.
-; CHECK-NOT: s_buffer_load_dword m0
-; CHECK: s_endpgm
+; GCN-NOT: s_buffer_load_dword m0
+; GCN: s_endpgm
+
+; TOVGPR: ScratchSize: 0{{$}}
define amdgpu_ps void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
main_body:
%tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0
@@ -97,29 +99,114 @@ main_body:
%tmp89 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp88, !tbaa !0
%tmp90 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 7
%tmp91 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp90, !tbaa !0
- %tmp92 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg4, <2 x i32> %arg6)
- %tmp93 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg4, <2 x i32> %arg6)
- %tmp94 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %arg4, <2 x i32> %arg6)
- %tmp95 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %arg4, <2 x i32> %arg6)
- %tmp96 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %arg4, <2 x i32> %arg6)
- %tmp97 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %arg4, <2 x i32> %arg6)
- %tmp98 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %arg4, <2 x i32> %arg6)
- %tmp99 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %arg4, <2 x i32> %arg6)
- %tmp100 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %arg4, <2 x i32> %arg6)
- %tmp101 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %arg4, <2 x i32> %arg6)
- %tmp102 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %arg4, <2 x i32> %arg6)
- %tmp103 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %arg4, <2 x i32> %arg6)
- %tmp104 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %arg4, <2 x i32> %arg6)
- %tmp105 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %arg4, <2 x i32> %arg6)
- %tmp106 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %arg4, <2 x i32> %arg6)
- %tmp107 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %arg4, <2 x i32> %arg6)
- %tmp108 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %arg4, <2 x i32> %arg6)
+ %i.i = extractelement <2 x i32> %arg6, i32 0
+ %j.i = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i = bitcast i32 %i.i to float
+ %j.f.i = bitcast i32 %j.i to float
+ %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg4) #0
+ %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg4) #0
+ %i.i91 = extractelement <2 x i32> %arg6, i32 0
+ %j.i92 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i93 = bitcast i32 %i.i91 to float
+ %j.f.i94 = bitcast i32 %j.i92 to float
+ %p1.i95 = call float @llvm.amdgcn.interp.p1(float %i.f.i93, i32 1, i32 0, i32 %arg4) #0
+ %p2.i96 = call float @llvm.amdgcn.interp.p2(float %p1.i95, float %j.f.i94, i32 1, i32 0, i32 %arg4) #0
+ %i.i85 = extractelement <2 x i32> %arg6, i32 0
+ %j.i86 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i87 = bitcast i32 %i.i85 to float
+ %j.f.i88 = bitcast i32 %j.i86 to float
+ %p1.i89 = call float @llvm.amdgcn.interp.p1(float %i.f.i87, i32 0, i32 1, i32 %arg4) #0
+ %p2.i90 = call float @llvm.amdgcn.interp.p2(float %p1.i89, float %j.f.i88, i32 0, i32 1, i32 %arg4) #0
+ %i.i79 = extractelement <2 x i32> %arg6, i32 0
+ %j.i80 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i81 = bitcast i32 %i.i79 to float
+ %j.f.i82 = bitcast i32 %j.i80 to float
+ %p1.i83 = call float @llvm.amdgcn.interp.p1(float %i.f.i81, i32 1, i32 1, i32 %arg4) #0
+ %p2.i84 = call float @llvm.amdgcn.interp.p2(float %p1.i83, float %j.f.i82, i32 1, i32 1, i32 %arg4) #0
+ %i.i73 = extractelement <2 x i32> %arg6, i32 0
+ %j.i74 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i75 = bitcast i32 %i.i73 to float
+ %j.f.i76 = bitcast i32 %j.i74 to float
+ %p1.i77 = call float @llvm.amdgcn.interp.p1(float %i.f.i75, i32 2, i32 1, i32 %arg4) #0
+ %p2.i78 = call float @llvm.amdgcn.interp.p2(float %p1.i77, float %j.f.i76, i32 2, i32 1, i32 %arg4) #0
+ %i.i67 = extractelement <2 x i32> %arg6, i32 0
+ %j.i68 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i69 = bitcast i32 %i.i67 to float
+ %j.f.i70 = bitcast i32 %j.i68 to float
+ %p1.i71 = call float @llvm.amdgcn.interp.p1(float %i.f.i69, i32 0, i32 2, i32 %arg4) #0
+ %p2.i72 = call float @llvm.amdgcn.interp.p2(float %p1.i71, float %j.f.i70, i32 0, i32 2, i32 %arg4) #0
+ %i.i61 = extractelement <2 x i32> %arg6, i32 0
+ %j.i62 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i63 = bitcast i32 %i.i61 to float
+ %j.f.i64 = bitcast i32 %j.i62 to float
+ %p1.i65 = call float @llvm.amdgcn.interp.p1(float %i.f.i63, i32 1, i32 2, i32 %arg4) #0
+ %p2.i66 = call float @llvm.amdgcn.interp.p2(float %p1.i65, float %j.f.i64, i32 1, i32 2, i32 %arg4) #0
+ %i.i55 = extractelement <2 x i32> %arg6, i32 0
+ %j.i56 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i57 = bitcast i32 %i.i55 to float
+ %j.f.i58 = bitcast i32 %j.i56 to float
+ %p1.i59 = call float @llvm.amdgcn.interp.p1(float %i.f.i57, i32 2, i32 2, i32 %arg4) #0
+ %p2.i60 = call float @llvm.amdgcn.interp.p2(float %p1.i59, float %j.f.i58, i32 2, i32 2, i32 %arg4) #0
+ %i.i49 = extractelement <2 x i32> %arg6, i32 0
+ %j.i50 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i51 = bitcast i32 %i.i49 to float
+ %j.f.i52 = bitcast i32 %j.i50 to float
+ %p1.i53 = call float @llvm.amdgcn.interp.p1(float %i.f.i51, i32 0, i32 3, i32 %arg4) #0
+ %p2.i54 = call float @llvm.amdgcn.interp.p2(float %p1.i53, float %j.f.i52, i32 0, i32 3, i32 %arg4) #0
+ %i.i43 = extractelement <2 x i32> %arg6, i32 0
+ %j.i44 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i45 = bitcast i32 %i.i43 to float
+ %j.f.i46 = bitcast i32 %j.i44 to float
+ %p1.i47 = call float @llvm.amdgcn.interp.p1(float %i.f.i45, i32 1, i32 3, i32 %arg4) #0
+ %p2.i48 = call float @llvm.amdgcn.interp.p2(float %p1.i47, float %j.f.i46, i32 1, i32 3, i32 %arg4) #0
+ %i.i37 = extractelement <2 x i32> %arg6, i32 0
+ %j.i38 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i39 = bitcast i32 %i.i37 to float
+ %j.f.i40 = bitcast i32 %j.i38 to float
+ %p1.i41 = call float @llvm.amdgcn.interp.p1(float %i.f.i39, i32 2, i32 3, i32 %arg4) #0
+ %p2.i42 = call float @llvm.amdgcn.interp.p2(float %p1.i41, float %j.f.i40, i32 2, i32 3, i32 %arg4) #0
+ %i.i31 = extractelement <2 x i32> %arg6, i32 0
+ %j.i32 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i33 = bitcast i32 %i.i31 to float
+ %j.f.i34 = bitcast i32 %j.i32 to float
+ %p1.i35 = call float @llvm.amdgcn.interp.p1(float %i.f.i33, i32 0, i32 4, i32 %arg4) #0
+ %p2.i36 = call float @llvm.amdgcn.interp.p2(float %p1.i35, float %j.f.i34, i32 0, i32 4, i32 %arg4) #0
+ %i.i25 = extractelement <2 x i32> %arg6, i32 0
+ %j.i26 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i27 = bitcast i32 %i.i25 to float
+ %j.f.i28 = bitcast i32 %j.i26 to float
+ %p1.i29 = call float @llvm.amdgcn.interp.p1(float %i.f.i27, i32 1, i32 4, i32 %arg4) #0
+ %p2.i30 = call float @llvm.amdgcn.interp.p2(float %p1.i29, float %j.f.i28, i32 1, i32 4, i32 %arg4) #0
+ %i.i19 = extractelement <2 x i32> %arg6, i32 0
+ %j.i20 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i21 = bitcast i32 %i.i19 to float
+ %j.f.i22 = bitcast i32 %j.i20 to float
+ %p1.i23 = call float @llvm.amdgcn.interp.p1(float %i.f.i21, i32 2, i32 4, i32 %arg4) #0
+ %p2.i24 = call float @llvm.amdgcn.interp.p2(float %p1.i23, float %j.f.i22, i32 2, i32 4, i32 %arg4) #0
+ %i.i13 = extractelement <2 x i32> %arg6, i32 0
+ %j.i14 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i15 = bitcast i32 %i.i13 to float
+ %j.f.i16 = bitcast i32 %j.i14 to float
+ %p1.i17 = call float @llvm.amdgcn.interp.p1(float %i.f.i15, i32 0, i32 5, i32 %arg4) #0
+ %p2.i18 = call float @llvm.amdgcn.interp.p2(float %p1.i17, float %j.f.i16, i32 0, i32 5, i32 %arg4) #0
+ %i.i7 = extractelement <2 x i32> %arg6, i32 0
+ %j.i8 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i9 = bitcast i32 %i.i7 to float
+ %j.f.i10 = bitcast i32 %j.i8 to float
+ %p1.i11 = call float @llvm.amdgcn.interp.p1(float %i.f.i9, i32 1, i32 5, i32 %arg4) #0
+ %p2.i12 = call float @llvm.amdgcn.interp.p2(float %p1.i11, float %j.f.i10, i32 1, i32 5, i32 %arg4) #0
+ %i.i1 = extractelement <2 x i32> %arg6, i32 0
+ %j.i2 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i3 = bitcast i32 %i.i1 to float
+ %j.f.i4 = bitcast i32 %j.i2 to float
+ %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 5, i32 %arg4) #0
+ %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 5, i32 %arg4) #0
%mbcnt.lo.0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tmp109 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.0)
%tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp109
- %tmp111 = bitcast float %tmp92 to i32
+ %tmp111 = bitcast float %p2.i to i32
store i32 %tmp111, i32 addrspace(3)* %tmp110
- %tmp112 = bitcast float %tmp93 to i32
+ %tmp112 = bitcast float %p2.i96 to i32
store i32 %tmp112, i32 addrspace(3)* %tmp110
%mbcnt.lo.1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tmp113 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.1)
@@ -128,14 +215,14 @@ main_body:
%tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp115
%tmp117 = add i32 %tmp115, 1
%tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp117
- %tmp119 = bitcast float %tmp92 to i32
+ %tmp119 = bitcast float %p2.i to i32
store i32 %tmp119, i32 addrspace(3)* %tmp114
%tmp120 = load i32, i32 addrspace(3)* %tmp116
%tmp121 = bitcast i32 %tmp120 to float
%tmp122 = load i32, i32 addrspace(3)* %tmp118
%tmp123 = bitcast i32 %tmp122 to float
%tmp124 = fsub float %tmp123, %tmp121
- %tmp125 = bitcast float %tmp93 to i32
+ %tmp125 = bitcast float %p2.i96 to i32
store i32 %tmp125, i32 addrspace(3)* %tmp114
%tmp126 = load i32, i32 addrspace(3)* %tmp116
%tmp127 = bitcast i32 %tmp126 to float
@@ -148,10 +235,10 @@ main_body:
%tmp134 = insertelement <4 x float> %tmp133, float %tmp130, i32 3
%tmp135 = extractelement <4 x float> %tmp134, i32 0
%tmp136 = extractelement <4 x float> %tmp134, i32 1
- %tmp137 = fmul float %tmp59, %tmp92
- %tmp138 = fmul float %tmp59, %tmp93
- %tmp139 = fmul float %tmp59, %tmp93
- %tmp140 = fmul float %tmp59, %tmp93
+ %tmp137 = fmul float %tmp59, %p2.i
+ %tmp138 = fmul float %tmp59, %p2.i96
+ %tmp139 = fmul float %tmp59, %p2.i96
+ %tmp140 = fmul float %tmp59, %p2.i96
%mbcnt.lo.2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tmp141 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.2)
%tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp141
@@ -204,26 +291,26 @@ main_body:
%tmp180 = insertelement <4 x float> %tmp179, float %tmp176, i32 3
%tmp181 = extractelement <4 x float> %tmp180, i32 0
%tmp182 = extractelement <4 x float> %tmp180, i32 1
- %tmp183 = fdiv float 1.000000e+00, %tmp96
+ %tmp183 = fdiv float 1.000000e+00, %p2.i78
%tmp184 = fmul float %tmp32, %tmp183
%tmp185 = fcmp uge float 1.000000e+00, %tmp184
%tmp186 = select i1 %tmp185, float %tmp184, float 1.000000e+00
%tmp187 = fmul float %tmp186, %tmp29
- %tmp188 = call float @ceil(float %tmp187)
+ %tmp188 = call float @llvm.ceil.f32(float %tmp187)
%tmp189 = fcmp uge float 3.000000e+00, %tmp188
%tmp190 = select i1 %tmp189, float 3.000000e+00, float %tmp188
%tmp191 = fdiv float 1.000000e+00, %tmp190
%tmp192 = fdiv float 1.000000e+00, %tmp29
%tmp193 = fmul float %tmp190, %tmp192
%tmp194 = fmul float %tmp30, %tmp193
- %tmp195 = fmul float %tmp94, %tmp94
- %tmp196 = fmul float %tmp95, %tmp95
+ %tmp195 = fmul float %p2.i90, %p2.i90
+ %tmp196 = fmul float %p2.i84, %p2.i84
%tmp197 = fadd float %tmp196, %tmp195
- %tmp198 = fmul float %tmp96, %tmp96
+ %tmp198 = fmul float %p2.i78, %p2.i78
%tmp199 = fadd float %tmp197, %tmp198
%tmp200 = call float @llvm.amdgcn.rsq.f32(float %tmp199)
- %tmp201 = fmul float %tmp94, %tmp200
- %tmp202 = fmul float %tmp95, %tmp200
+ %tmp201 = fmul float %p2.i90, %tmp200
+ %tmp202 = fmul float %p2.i84, %tmp200
%tmp203 = fmul float %tmp201, %tmp28
%tmp204 = fmul float %tmp202, %tmp28
%tmp205 = fmul float %tmp203, -1.000000e+00
@@ -231,9 +318,9 @@ main_body:
%tmp207 = fmul float %tmp205, %tmp31
%tmp208 = fmul float %tmp206, %tmp31
%tmp209 = fsub float -0.000000e+00, %tmp207
- %tmp210 = fadd float %tmp92, %tmp209
+ %tmp210 = fadd float %p2.i, %tmp209
%tmp211 = fsub float -0.000000e+00, %tmp208
- %tmp212 = fadd float %tmp93, %tmp211
+ %tmp212 = fadd float %p2.i96, %tmp211
%tmp213 = fmul float %tmp205, %tmp191
%tmp214 = fmul float %tmp206, %tmp191
%tmp215 = fmul float -1.000000e+00, %tmp191
@@ -277,7 +364,8 @@ ENDIF: ; preds = %LOOP
%tmp240 = insertelement <8 x i32> %tmp239, i32 %tmp238, i32 5
%tmp241 = insertelement <8 x i32> %tmp240, i32 undef, i32 6
%tmp242 = insertelement <8 x i32> %tmp241, i32 undef, i32 7
- %tmp243 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp242, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp242.bc = bitcast <8 x i32> %tmp242 to <8 x float>
+ %tmp243 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp242.bc, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp244 = extractelement <4 x float> %tmp243, i32 3
%tmp245 = fcmp oge float %temp30.0, %tmp244
%tmp246 = sext i1 %tmp245 to i32
@@ -323,7 +411,8 @@ IF67: ; preds = %LOOP65
%tmp275 = insertelement <8 x i32> %tmp274, i32 undef, i32 6
%tmp276 = insertelement <8 x i32> %tmp275, i32 undef, i32 7
%tmp67.bc = bitcast <16 x i8> %tmp67 to <4 x i32>
- %tmp277 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp276, <8 x i32> %tmp65, <4 x i32> %tmp67.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp276.bc = bitcast <8 x i32> %tmp276 to <8 x float>
+ %tmp277 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp276.bc, <8 x i32> %tmp65, <4 x i32> %tmp67.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp278 = extractelement <4 x float> %tmp277, i32 0
%tmp279 = extractelement <4 x float> %tmp277, i32 1
%tmp280 = extractelement <4 x float> %tmp277, i32 2
@@ -344,7 +433,8 @@ IF67: ; preds = %LOOP65
%tmp295 = insertelement <8 x i32> %tmp294, i32 undef, i32 6
%tmp296 = insertelement <8 x i32> %tmp295, i32 undef, i32 7
%tmp83.bc = bitcast <16 x i8> %tmp83 to <4 x i32>
- %tmp297 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp296, <8 x i32> %tmp81, <4 x i32> %tmp83.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp296.bc = bitcast <8 x i32> %tmp296 to <8 x float>
+ %tmp297 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp296.bc, <8 x i32> %tmp81, <4 x i32> %tmp83.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp298 = extractelement <4 x float> %tmp297, i32 0
%tmp299 = extractelement <4 x float> %tmp297, i32 1
%tmp300 = extractelement <4 x float> %tmp297, i32 2
@@ -363,7 +453,8 @@ IF67: ; preds = %LOOP65
%tmp313 = insertelement <8 x i32> %tmp312, i32 undef, i32 6
%tmp314 = insertelement <8 x i32> %tmp313, i32 undef, i32 7
%tmp79.bc = bitcast <16 x i8> %tmp79 to <4 x i32>
- %tmp315 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp314, <8 x i32> %tmp77, <4 x i32> %tmp79.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp314.bc = bitcast <8 x i32> %tmp314 to <8 x float>
+ %tmp315 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp314.bc, <8 x i32> %tmp77, <4 x i32> %tmp79.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp316 = extractelement <4 x float> %tmp315, i32 0
%tmp317 = extractelement <4 x float> %tmp315, i32 1
%tmp318 = extractelement <4 x float> %tmp315, i32 2
@@ -393,7 +484,8 @@ IF67: ; preds = %LOOP65
%tmp342 = insertelement <8 x i32> %tmp341, i32 %tmp336, i32 5
%tmp343 = insertelement <8 x i32> %tmp342, i32 undef, i32 6
%tmp344 = insertelement <8 x i32> %tmp343, i32 undef, i32 7
- %tmp345 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp344, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp344.bc = bitcast <8 x i32> %tmp344 to <8 x float>
+ %tmp345 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp344.bc, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp346 = extractelement <4 x float> %tmp345, i32 0
%tmp347 = extractelement <4 x float> %tmp345, i32 1
%tmp348 = extractelement <4 x float> %tmp345, i32 2
@@ -424,14 +516,15 @@ IF67: ; preds = %LOOP65
%tmp373 = insertelement <8 x i32> %tmp372, i32 undef, i32 6
%tmp374 = insertelement <8 x i32> %tmp373, i32 undef, i32 7
%tmp71.bc = bitcast <16 x i8> %tmp71 to <4 x i32>
- %tmp375 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp374, <8 x i32> %tmp69, <4 x i32> %tmp71.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp374.bc = bitcast <8 x i32> %tmp374 to <8 x float>
+ %tmp375 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp374.bc, <8 x i32> %tmp69, <4 x i32> %tmp71.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp376 = extractelement <4 x float> %tmp375, i32 0
%tmp377 = extractelement <4 x float> %tmp375, i32 1
%tmp378 = extractelement <4 x float> %tmp375, i32 2
%tmp379 = extractelement <4 x float> %tmp375, i32 3
- %tmp380 = fsub float -0.000000e+00, %tmp94
- %tmp381 = fsub float -0.000000e+00, %tmp95
- %tmp382 = fsub float -0.000000e+00, %tmp96
+ %tmp380 = fsub float -0.000000e+00, %p2.i90
+ %tmp381 = fsub float -0.000000e+00, %p2.i84
+ %tmp382 = fsub float -0.000000e+00, %p2.i78
%tmp383 = fmul float %tmp358, %tmp380
%tmp384 = fmul float %tmp359, %tmp381
%tmp385 = fadd float %tmp384, %tmp383
@@ -449,20 +542,20 @@ IF67: ; preds = %LOOP65
%tmp397 = fadd float %tmp381, %tmp396
%tmp398 = fsub float -0.000000e+00, %tmp393
%tmp399 = fadd float %tmp382, %tmp398
- %tmp400 = fmul float %tmp395, %tmp97
- %tmp401 = fmul float %tmp395, %tmp98
- %tmp402 = fmul float %tmp395, %tmp99
- %tmp403 = fmul float %tmp397, %tmp100
+ %tmp400 = fmul float %tmp395, %p2.i72
+ %tmp401 = fmul float %tmp395, %p2.i66
+ %tmp402 = fmul float %tmp395, %p2.i60
+ %tmp403 = fmul float %tmp397, %p2.i54
%tmp404 = fadd float %tmp403, %tmp400
- %tmp405 = fmul float %tmp397, %tmp101
+ %tmp405 = fmul float %tmp397, %p2.i48
%tmp406 = fadd float %tmp405, %tmp401
- %tmp407 = fmul float %tmp397, %tmp102
+ %tmp407 = fmul float %tmp397, %p2.i42
%tmp408 = fadd float %tmp407, %tmp402
- %tmp409 = fmul float %tmp399, %tmp103
+ %tmp409 = fmul float %tmp399, %p2.i36
%tmp410 = fadd float %tmp409, %tmp404
- %tmp411 = fmul float %tmp399, %tmp104
+ %tmp411 = fmul float %tmp399, %p2.i30
%tmp412 = fadd float %tmp411, %tmp406
- %tmp413 = fmul float %tmp399, %tmp105
+ %tmp413 = fmul float %tmp399, %p2.i24
%tmp414 = fadd float %tmp413, %tmp408
%tmp415 = bitcast float %tmp135 to i32
%tmp416 = bitcast float %tmp181 to i32
@@ -479,7 +572,8 @@ IF67: ; preds = %LOOP65
%tmp427 = insertelement <8 x i32> %tmp426, i32 undef, i32 6
%tmp428 = insertelement <8 x i32> %tmp427, i32 undef, i32 7
%tmp87.bc = bitcast <16 x i8> %tmp87 to <4 x i32>
- %tmp429 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp428, <8 x i32> %tmp85, <4 x i32> %tmp87.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp428.bc = bitcast <8 x i32> %tmp428 to <8 x float>
+ %tmp429 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp428.bc, <8 x i32> %tmp85, <4 x i32> %tmp87.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp430 = extractelement <4 x float> %tmp429, i32 0
%tmp431 = extractelement <4 x float> %tmp429, i32 1
%tmp432 = extractelement <4 x float> %tmp429, i32 2
@@ -502,12 +596,22 @@ IF67: ; preds = %LOOP65
%tmp449 = insertelement <4 x float> %tmp448, float %tmp445, i32 1
%tmp450 = insertelement <4 x float> %tmp449, float %tmp447, i32 2
%tmp451 = insertelement <4 x float> %tmp450, float %tmp194, i32 3
- %tmp452 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %tmp451)
+ %tmp451.x = extractelement <4 x float> %tmp451, i32 0
+ %tmp451.y = extractelement <4 x float> %tmp451, i32 1
+ %tmp451.z = extractelement <4 x float> %tmp451, i32 2
+ %cubetc = call float @llvm.amdgcn.cubetc(float %tmp451.x, float %tmp451.y, float %tmp451.z)
+ %cubesc = call float @llvm.amdgcn.cubesc(float %tmp451.x, float %tmp451.y, float %tmp451.z)
+ %cubema = call float @llvm.amdgcn.cubema(float %tmp451.x, float %tmp451.y, float %tmp451.z)
+ %cubeid = call float @llvm.amdgcn.cubeid(float %tmp451.x, float %tmp451.y, float %tmp451.z)
+ %tmp452.0 = insertelement <4 x float> undef, float %cubetc, i32 0
+ %tmp452.1 = insertelement <4 x float> %tmp452.0, float %cubesc, i32 1
+ %tmp452.2 = insertelement <4 x float> %tmp452.1, float %cubema, i32 2
+ %tmp452 = insertelement <4 x float> %tmp452.2, float %cubeid, i32 3
%tmp453 = extractelement <4 x float> %tmp452, i32 0
%tmp454 = extractelement <4 x float> %tmp452, i32 1
%tmp455 = extractelement <4 x float> %tmp452, i32 2
%tmp456 = extractelement <4 x float> %tmp452, i32 3
- %tmp457 = call float @fabs(float %tmp455)
+ %tmp457 = call float @llvm.fabs.f32(float %tmp455)
%tmp458 = fdiv float 1.000000e+00, %tmp457
%tmp459 = fmul float %tmp453, %tmp458
%tmp460 = fadd float %tmp459, 1.500000e+00
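The llvm.AMDGPU.cube replacement in the hunk above is the one non-mechanical rewrite in this function: the old intrinsic returned all four cube-map terms in a single <4 x float>, and the new form computes each term with a scalar intrinsic and rebuilds the vector in the same element order. Schematically (%x, %y, %z are placeholders for the three direction components):

  %tc = call float @llvm.amdgcn.cubetc(float %x, float %y, float %z)  ; element 0: T coordinate
  %sc = call float @llvm.amdgcn.cubesc(float %x, float %y, float %z)  ; element 1: S coordinate
  %ma = call float @llvm.amdgcn.cubema(float %x, float %y, float %z)  ; element 2: major-axis term
  %id = call float @llvm.amdgcn.cubeid(float %x, float %y, float %z)  ; element 3: face id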
@@ -521,7 +625,8 @@ IF67: ; preds = %LOOP65
%tmp468 = insertelement <4 x i32> %tmp467, i32 %tmp465, i32 2
%tmp469 = insertelement <4 x i32> %tmp468, i32 undef, i32 3
%tmp91.bc = bitcast <16 x i8> %tmp91 to <4 x i32>
- %tmp470 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tmp469, <8 x i32> %tmp89, <4 x i32> %tmp91.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp469.bc = bitcast <4 x i32> %tmp469 to <4 x float>
+ %tmp470 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tmp469.bc, <8 x i32> %tmp89, <4 x i32> %tmp91.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
%tmp471 = extractelement <4 x float> %tmp470, i32 0
%tmp472 = extractelement <4 x float> %tmp470, i32 1
%tmp473 = extractelement <4 x float> %tmp470, i32 2
@@ -531,15 +636,15 @@ IF67: ; preds = %LOOP65
%tmp477 = fadd float %tmp476, %tmp329
%tmp478 = fmul float %tmp432, %tmp473
%tmp479 = fadd float %tmp478, %tmp330
- %tmp480 = fmul float %tmp106, %tmp106
- %tmp481 = fmul float %tmp107, %tmp107
+ %tmp480 = fmul float %p2.i18, %p2.i18
+ %tmp481 = fmul float %p2.i12, %p2.i12
%tmp482 = fadd float %tmp481, %tmp480
- %tmp483 = fmul float %tmp108, %tmp108
+ %tmp483 = fmul float %p2.i6, %p2.i6
%tmp484 = fadd float %tmp482, %tmp483
%tmp485 = call float @llvm.amdgcn.rsq.f32(float %tmp484)
- %tmp486 = fmul float %tmp106, %tmp485
- %tmp487 = fmul float %tmp107, %tmp485
- %tmp488 = fmul float %tmp108, %tmp485
+ %tmp486 = fmul float %p2.i18, %tmp485
+ %tmp487 = fmul float %p2.i12, %tmp485
+ %tmp488 = fmul float %p2.i6, %tmp485
%tmp489 = fmul float %tmp376, %tmp39
%tmp490 = fmul float %tmp377, %tmp40
%tmp491 = fmul float %tmp378, %tmp41
@@ -560,15 +665,15 @@ IF67: ; preds = %LOOP65
%tmp506 = fadd float %tmp487, %tmp505
%tmp507 = fsub float -0.000000e+00, %tmp502
%tmp508 = fadd float %tmp488, %tmp507
- %tmp509 = fmul float %tmp94, %tmp94
- %tmp510 = fmul float %tmp95, %tmp95
+ %tmp509 = fmul float %p2.i90, %p2.i90
+ %tmp510 = fmul float %p2.i84, %p2.i84
%tmp511 = fadd float %tmp510, %tmp509
- %tmp512 = fmul float %tmp96, %tmp96
+ %tmp512 = fmul float %p2.i78, %p2.i78
%tmp513 = fadd float %tmp511, %tmp512
%tmp514 = call float @llvm.amdgcn.rsq.f32(float %tmp513)
- %tmp515 = fmul float %tmp94, %tmp514
- %tmp516 = fmul float %tmp95, %tmp514
- %tmp517 = fmul float %tmp96, %tmp514
+ %tmp515 = fmul float %p2.i90, %tmp514
+ %tmp516 = fmul float %p2.i84, %tmp514
+ %tmp517 = fmul float %p2.i78, %tmp514
%tmp518 = fmul float %tmp504, %tmp515
%tmp519 = fmul float %tmp506, %tmp516
%tmp520 = fadd float %tmp519, %tmp518
@@ -623,7 +728,8 @@ IF67: ; preds = %LOOP65
%tmp569 = insertelement <8 x i32> %tmp568, i32 undef, i32 6
%tmp570 = insertelement <8 x i32> %tmp569, i32 undef, i32 7
%tmp75.bc = bitcast <16 x i8> %tmp75 to <4 x i32>
- %tmp571 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp570, <8 x i32> %tmp73, <4 x i32> %tmp75.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp570.bc = bitcast <8 x i32> %tmp570 to <8 x float>
+ %tmp571 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp570.bc, <8 x i32> %tmp73, <4 x i32> %tmp75.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp572 = extractelement <4 x float> %tmp571, i32 0
%tmp573 = extractelement <4 x float> %tmp571, i32 1
%tmp574 = extractelement <4 x float> %tmp571, i32 2
@@ -633,11 +739,9 @@ IF67: ; preds = %LOOP65
%tmp578 = fadd float %tmp577, %tmp554
%tmp579 = fmul float %tmp574, %tmp45
%tmp580 = fadd float %tmp579, %tmp556
- %tmp581 = call i32 @llvm.SI.packf16(float %tmp576, float %tmp578)
- %tmp582 = bitcast i32 %tmp581 to float
- %tmp583 = call i32 @llvm.SI.packf16(float %tmp580, float %tmp282)
- %tmp584 = bitcast i32 %tmp583 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp582, float %tmp584, float %tmp582, float %tmp584)
+ %tmp581 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp576, float %tmp578)
+ %tmp583 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp580, float %tmp282)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp581, <2 x half> %tmp583, i1 true, i1 true) #0
ret void
ENDIF66: ; preds = %LOOP65
@@ -647,7 +751,8 @@ ENDIF66: ; preds = %LOOP65
%tmp588 = insertelement <8 x i32> %tmp587, i32 %tmp586, i32 5
%tmp589 = insertelement <8 x i32> %tmp588, i32 undef, i32 6
%tmp590 = insertelement <8 x i32> %tmp589, i32 undef, i32 7
- %tmp591 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp590, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp590.bc = bitcast <8 x i32> %tmp590 to <8 x float>
+ %tmp591 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp590.bc, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp592 = extractelement <4 x float> %tmp591, i32 3
%tmp593 = fcmp oge float %temp30.1, %tmp592
%tmp594 = sext i1 %tmp593 to i32
@@ -670,9 +775,10 @@ ENDIF66: ; preds = %LOOP65
br label %LOOP65
}
-; CHECK-LABEL: {{^}}main1:
-; CHECK: s_endpgm
-define amdgpu_ps void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
+; GCN-LABEL: {{^}}main1:
+; GCN: s_endpgm
+; TOVGPR: ScratchSize: 0{{$}}
+define amdgpu_ps void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
main_body:
%tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0
%tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
@@ -817,52 +923,210 @@ main_body:
%tmp160 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp159, !tbaa !0
%tmp161 = fcmp ugt float %arg17, 0.000000e+00
%tmp162 = select i1 %tmp161, float 1.000000e+00, float 0.000000e+00
- %tmp163 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg4, <2 x i32> %arg6)
- %tmp164 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg4, <2 x i32> %arg6)
- %tmp165 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %arg4, <2 x i32> %arg6)
- %tmp166 = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %arg4, <2 x i32> %arg6)
- %tmp167 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %arg4, <2 x i32> %arg6)
- %tmp168 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %arg4, <2 x i32> %arg6)
- %tmp169 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %arg4, <2 x i32> %arg6)
- %tmp170 = call float @llvm.SI.fs.interp(i32 3, i32 1, i32 %arg4, <2 x i32> %arg6)
- %tmp171 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %arg4, <2 x i32> %arg6)
- %tmp172 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %arg4, <2 x i32> %arg6)
- %tmp173 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %arg4, <2 x i32> %arg6)
- %tmp174 = call float @llvm.SI.fs.interp(i32 3, i32 2, i32 %arg4, <2 x i32> %arg6)
- %tmp175 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %arg4, <2 x i32> %arg6)
- %tmp176 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %arg4, <2 x i32> %arg6)
- %tmp177 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %arg4, <2 x i32> %arg6)
- %tmp178 = call float @llvm.SI.fs.interp(i32 3, i32 3, i32 %arg4, <2 x i32> %arg6)
- %tmp179 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %arg4, <2 x i32> %arg6)
- %tmp180 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %arg4, <2 x i32> %arg6)
- %tmp181 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %arg4, <2 x i32> %arg6)
- %tmp182 = call float @llvm.SI.fs.interp(i32 3, i32 4, i32 %arg4, <2 x i32> %arg6)
- %tmp183 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %arg4, <2 x i32> %arg6)
- %tmp184 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %arg4, <2 x i32> %arg6)
- %tmp185 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %arg4, <2 x i32> %arg6)
- %tmp186 = call float @llvm.SI.fs.interp(i32 3, i32 5, i32 %arg4, <2 x i32> %arg6)
- %tmp187 = call float @llvm.SI.fs.interp(i32 0, i32 6, i32 %arg4, <2 x i32> %arg6)
- %tmp188 = call float @llvm.SI.fs.interp(i32 1, i32 6, i32 %arg4, <2 x i32> %arg6)
- %tmp189 = call float @llvm.SI.fs.interp(i32 2, i32 6, i32 %arg4, <2 x i32> %arg6)
- %tmp190 = call float @llvm.SI.fs.interp(i32 3, i32 6, i32 %arg4, <2 x i32> %arg6)
- %tmp191 = call float @llvm.SI.fs.interp(i32 0, i32 7, i32 %arg4, <2 x i32> %arg6)
- %tmp192 = call float @llvm.SI.fs.interp(i32 1, i32 7, i32 %arg4, <2 x i32> %arg6)
- %tmp193 = call float @llvm.SI.fs.interp(i32 2, i32 7, i32 %arg4, <2 x i32> %arg6)
- %tmp194 = call float @llvm.SI.fs.interp(i32 3, i32 7, i32 %arg4, <2 x i32> %arg6)
+ %i.i = extractelement <2 x i32> %arg6, i32 0
+ %j.i = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i = bitcast i32 %i.i to float
+ %j.f.i = bitcast i32 %j.i to float
+ %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg4) #0
+ %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg4) #0
+ %i.i181 = extractelement <2 x i32> %arg6, i32 0
+ %j.i182 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i183 = bitcast i32 %i.i181 to float
+ %j.f.i184 = bitcast i32 %j.i182 to float
+ %p1.i185 = call float @llvm.amdgcn.interp.p1(float %i.f.i183, i32 1, i32 0, i32 %arg4) #0
+ %p2.i186 = call float @llvm.amdgcn.interp.p2(float %p1.i185, float %j.f.i184, i32 1, i32 0, i32 %arg4) #0
+ %i.i175 = extractelement <2 x i32> %arg6, i32 0
+ %j.i176 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i177 = bitcast i32 %i.i175 to float
+ %j.f.i178 = bitcast i32 %j.i176 to float
+ %p1.i179 = call float @llvm.amdgcn.interp.p1(float %i.f.i177, i32 2, i32 0, i32 %arg4) #0
+ %p2.i180 = call float @llvm.amdgcn.interp.p2(float %p1.i179, float %j.f.i178, i32 2, i32 0, i32 %arg4) #0
+ %i.i169 = extractelement <2 x i32> %arg6, i32 0
+ %j.i170 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i171 = bitcast i32 %i.i169 to float
+ %j.f.i172 = bitcast i32 %j.i170 to float
+ %p1.i173 = call float @llvm.amdgcn.interp.p1(float %i.f.i171, i32 3, i32 0, i32 %arg4) #0
+ %p2.i174 = call float @llvm.amdgcn.interp.p2(float %p1.i173, float %j.f.i172, i32 3, i32 0, i32 %arg4) #0
+ %i.i163 = extractelement <2 x i32> %arg6, i32 0
+ %j.i164 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i165 = bitcast i32 %i.i163 to float
+ %j.f.i166 = bitcast i32 %j.i164 to float
+ %p1.i167 = call float @llvm.amdgcn.interp.p1(float %i.f.i165, i32 0, i32 1, i32 %arg4) #0
+ %p2.i168 = call float @llvm.amdgcn.interp.p2(float %p1.i167, float %j.f.i166, i32 0, i32 1, i32 %arg4) #0
+ %i.i157 = extractelement <2 x i32> %arg6, i32 0
+ %j.i158 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i159 = bitcast i32 %i.i157 to float
+ %j.f.i160 = bitcast i32 %j.i158 to float
+ %p1.i161 = call float @llvm.amdgcn.interp.p1(float %i.f.i159, i32 1, i32 1, i32 %arg4) #0
+ %p2.i162 = call float @llvm.amdgcn.interp.p2(float %p1.i161, float %j.f.i160, i32 1, i32 1, i32 %arg4) #0
+ %i.i151 = extractelement <2 x i32> %arg6, i32 0
+ %j.i152 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i153 = bitcast i32 %i.i151 to float
+ %j.f.i154 = bitcast i32 %j.i152 to float
+ %p1.i155 = call float @llvm.amdgcn.interp.p1(float %i.f.i153, i32 2, i32 1, i32 %arg4) #0
+ %p2.i156 = call float @llvm.amdgcn.interp.p2(float %p1.i155, float %j.f.i154, i32 2, i32 1, i32 %arg4) #0
+ %i.i145 = extractelement <2 x i32> %arg6, i32 0
+ %j.i146 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i147 = bitcast i32 %i.i145 to float
+ %j.f.i148 = bitcast i32 %j.i146 to float
+ %p1.i149 = call float @llvm.amdgcn.interp.p1(float %i.f.i147, i32 3, i32 1, i32 %arg4) #0
+ %p2.i150 = call float @llvm.amdgcn.interp.p2(float %p1.i149, float %j.f.i148, i32 3, i32 1, i32 %arg4) #0
+ %i.i139 = extractelement <2 x i32> %arg6, i32 0
+ %j.i140 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i141 = bitcast i32 %i.i139 to float
+ %j.f.i142 = bitcast i32 %j.i140 to float
+ %p1.i143 = call float @llvm.amdgcn.interp.p1(float %i.f.i141, i32 0, i32 2, i32 %arg4) #0
+ %p2.i144 = call float @llvm.amdgcn.interp.p2(float %p1.i143, float %j.f.i142, i32 0, i32 2, i32 %arg4) #0
+ %i.i133 = extractelement <2 x i32> %arg6, i32 0
+ %j.i134 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i135 = bitcast i32 %i.i133 to float
+ %j.f.i136 = bitcast i32 %j.i134 to float
+ %p1.i137 = call float @llvm.amdgcn.interp.p1(float %i.f.i135, i32 1, i32 2, i32 %arg4) #0
+ %p2.i138 = call float @llvm.amdgcn.interp.p2(float %p1.i137, float %j.f.i136, i32 1, i32 2, i32 %arg4) #0
+ %i.i127 = extractelement <2 x i32> %arg6, i32 0
+ %j.i128 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i129 = bitcast i32 %i.i127 to float
+ %j.f.i130 = bitcast i32 %j.i128 to float
+ %p1.i131 = call float @llvm.amdgcn.interp.p1(float %i.f.i129, i32 2, i32 2, i32 %arg4) #0
+ %p2.i132 = call float @llvm.amdgcn.interp.p2(float %p1.i131, float %j.f.i130, i32 2, i32 2, i32 %arg4) #0
+ %i.i121 = extractelement <2 x i32> %arg6, i32 0
+ %j.i122 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i123 = bitcast i32 %i.i121 to float
+ %j.f.i124 = bitcast i32 %j.i122 to float
+ %p1.i125 = call float @llvm.amdgcn.interp.p1(float %i.f.i123, i32 3, i32 2, i32 %arg4) #0
+ %p2.i126 = call float @llvm.amdgcn.interp.p2(float %p1.i125, float %j.f.i124, i32 3, i32 2, i32 %arg4) #0
+ %i.i115 = extractelement <2 x i32> %arg6, i32 0
+ %j.i116 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i117 = bitcast i32 %i.i115 to float
+ %j.f.i118 = bitcast i32 %j.i116 to float
+ %p1.i119 = call float @llvm.amdgcn.interp.p1(float %i.f.i117, i32 0, i32 3, i32 %arg4) #0
+ %p2.i120 = call float @llvm.amdgcn.interp.p2(float %p1.i119, float %j.f.i118, i32 0, i32 3, i32 %arg4) #0
+ %i.i109 = extractelement <2 x i32> %arg6, i32 0
+ %j.i110 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i111 = bitcast i32 %i.i109 to float
+ %j.f.i112 = bitcast i32 %j.i110 to float
+ %p1.i113 = call float @llvm.amdgcn.interp.p1(float %i.f.i111, i32 1, i32 3, i32 %arg4) #0
+ %p2.i114 = call float @llvm.amdgcn.interp.p2(float %p1.i113, float %j.f.i112, i32 1, i32 3, i32 %arg4) #0
+ %i.i103 = extractelement <2 x i32> %arg6, i32 0
+ %j.i104 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i105 = bitcast i32 %i.i103 to float
+ %j.f.i106 = bitcast i32 %j.i104 to float
+ %p1.i107 = call float @llvm.amdgcn.interp.p1(float %i.f.i105, i32 2, i32 3, i32 %arg4) #0
+ %p2.i108 = call float @llvm.amdgcn.interp.p2(float %p1.i107, float %j.f.i106, i32 2, i32 3, i32 %arg4) #0
+ %i.i97 = extractelement <2 x i32> %arg6, i32 0
+ %j.i98 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i99 = bitcast i32 %i.i97 to float
+ %j.f.i100 = bitcast i32 %j.i98 to float
+ %p1.i101 = call float @llvm.amdgcn.interp.p1(float %i.f.i99, i32 3, i32 3, i32 %arg4) #0
+ %p2.i102 = call float @llvm.amdgcn.interp.p2(float %p1.i101, float %j.f.i100, i32 3, i32 3, i32 %arg4) #0
+ %i.i91 = extractelement <2 x i32> %arg6, i32 0
+ %j.i92 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i93 = bitcast i32 %i.i91 to float
+ %j.f.i94 = bitcast i32 %j.i92 to float
+ %p1.i95 = call float @llvm.amdgcn.interp.p1(float %i.f.i93, i32 0, i32 4, i32 %arg4) #0
+ %p2.i96 = call float @llvm.amdgcn.interp.p2(float %p1.i95, float %j.f.i94, i32 0, i32 4, i32 %arg4) #0
+ %i.i85 = extractelement <2 x i32> %arg6, i32 0
+ %j.i86 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i87 = bitcast i32 %i.i85 to float
+ %j.f.i88 = bitcast i32 %j.i86 to float
+ %p1.i89 = call float @llvm.amdgcn.interp.p1(float %i.f.i87, i32 1, i32 4, i32 %arg4) #0
+ %p2.i90 = call float @llvm.amdgcn.interp.p2(float %p1.i89, float %j.f.i88, i32 1, i32 4, i32 %arg4) #0
+ %i.i79 = extractelement <2 x i32> %arg6, i32 0
+ %j.i80 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i81 = bitcast i32 %i.i79 to float
+ %j.f.i82 = bitcast i32 %j.i80 to float
+ %p1.i83 = call float @llvm.amdgcn.interp.p1(float %i.f.i81, i32 2, i32 4, i32 %arg4) #0
+ %p2.i84 = call float @llvm.amdgcn.interp.p2(float %p1.i83, float %j.f.i82, i32 2, i32 4, i32 %arg4) #0
+ %i.i73 = extractelement <2 x i32> %arg6, i32 0
+ %j.i74 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i75 = bitcast i32 %i.i73 to float
+ %j.f.i76 = bitcast i32 %j.i74 to float
+ %p1.i77 = call float @llvm.amdgcn.interp.p1(float %i.f.i75, i32 3, i32 4, i32 %arg4) #0
+ %p2.i78 = call float @llvm.amdgcn.interp.p2(float %p1.i77, float %j.f.i76, i32 3, i32 4, i32 %arg4) #0
+ %i.i67 = extractelement <2 x i32> %arg6, i32 0
+ %j.i68 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i69 = bitcast i32 %i.i67 to float
+ %j.f.i70 = bitcast i32 %j.i68 to float
+ %p1.i71 = call float @llvm.amdgcn.interp.p1(float %i.f.i69, i32 0, i32 5, i32 %arg4) #0
+ %p2.i72 = call float @llvm.amdgcn.interp.p2(float %p1.i71, float %j.f.i70, i32 0, i32 5, i32 %arg4) #0
+ %i.i61 = extractelement <2 x i32> %arg6, i32 0
+ %j.i62 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i63 = bitcast i32 %i.i61 to float
+ %j.f.i64 = bitcast i32 %j.i62 to float
+ %p1.i65 = call float @llvm.amdgcn.interp.p1(float %i.f.i63, i32 1, i32 5, i32 %arg4) #0
+ %p2.i66 = call float @llvm.amdgcn.interp.p2(float %p1.i65, float %j.f.i64, i32 1, i32 5, i32 %arg4) #0
+ %i.i55 = extractelement <2 x i32> %arg6, i32 0
+ %j.i56 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i57 = bitcast i32 %i.i55 to float
+ %j.f.i58 = bitcast i32 %j.i56 to float
+ %p1.i59 = call float @llvm.amdgcn.interp.p1(float %i.f.i57, i32 2, i32 5, i32 %arg4) #0
+ %p2.i60 = call float @llvm.amdgcn.interp.p2(float %p1.i59, float %j.f.i58, i32 2, i32 5, i32 %arg4) #0
+ %i.i49 = extractelement <2 x i32> %arg6, i32 0
+ %j.i50 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i51 = bitcast i32 %i.i49 to float
+ %j.f.i52 = bitcast i32 %j.i50 to float
+ %p1.i53 = call float @llvm.amdgcn.interp.p1(float %i.f.i51, i32 3, i32 5, i32 %arg4) #0
+ %p2.i54 = call float @llvm.amdgcn.interp.p2(float %p1.i53, float %j.f.i52, i32 3, i32 5, i32 %arg4) #0
+ %i.i43 = extractelement <2 x i32> %arg6, i32 0
+ %j.i44 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i45 = bitcast i32 %i.i43 to float
+ %j.f.i46 = bitcast i32 %j.i44 to float
+ %p1.i47 = call float @llvm.amdgcn.interp.p1(float %i.f.i45, i32 0, i32 6, i32 %arg4) #0
+ %p2.i48 = call float @llvm.amdgcn.interp.p2(float %p1.i47, float %j.f.i46, i32 0, i32 6, i32 %arg4) #0
+ %i.i37 = extractelement <2 x i32> %arg6, i32 0
+ %j.i38 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i39 = bitcast i32 %i.i37 to float
+ %j.f.i40 = bitcast i32 %j.i38 to float
+ %p1.i41 = call float @llvm.amdgcn.interp.p1(float %i.f.i39, i32 1, i32 6, i32 %arg4) #0
+ %p2.i42 = call float @llvm.amdgcn.interp.p2(float %p1.i41, float %j.f.i40, i32 1, i32 6, i32 %arg4) #0
+ %i.i31 = extractelement <2 x i32> %arg6, i32 0
+ %j.i32 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i33 = bitcast i32 %i.i31 to float
+ %j.f.i34 = bitcast i32 %j.i32 to float
+ %p1.i35 = call float @llvm.amdgcn.interp.p1(float %i.f.i33, i32 2, i32 6, i32 %arg4) #0
+ %p2.i36 = call float @llvm.amdgcn.interp.p2(float %p1.i35, float %j.f.i34, i32 2, i32 6, i32 %arg4) #0
+ %i.i25 = extractelement <2 x i32> %arg6, i32 0
+ %j.i26 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i27 = bitcast i32 %i.i25 to float
+ %j.f.i28 = bitcast i32 %j.i26 to float
+ %p1.i29 = call float @llvm.amdgcn.interp.p1(float %i.f.i27, i32 3, i32 6, i32 %arg4) #0
+ %p2.i30 = call float @llvm.amdgcn.interp.p2(float %p1.i29, float %j.f.i28, i32 3, i32 6, i32 %arg4) #0
+ %i.i19 = extractelement <2 x i32> %arg6, i32 0
+ %j.i20 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i21 = bitcast i32 %i.i19 to float
+ %j.f.i22 = bitcast i32 %j.i20 to float
+ %p1.i23 = call float @llvm.amdgcn.interp.p1(float %i.f.i21, i32 0, i32 7, i32 %arg4) #0
+ %p2.i24 = call float @llvm.amdgcn.interp.p2(float %p1.i23, float %j.f.i22, i32 0, i32 7, i32 %arg4) #0
+ %i.i13 = extractelement <2 x i32> %arg6, i32 0
+ %j.i14 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i15 = bitcast i32 %i.i13 to float
+ %j.f.i16 = bitcast i32 %j.i14 to float
+ %p1.i17 = call float @llvm.amdgcn.interp.p1(float %i.f.i15, i32 1, i32 7, i32 %arg4) #0
+ %p2.i18 = call float @llvm.amdgcn.interp.p2(float %p1.i17, float %j.f.i16, i32 1, i32 7, i32 %arg4) #0
+ %i.i7 = extractelement <2 x i32> %arg6, i32 0
+ %j.i8 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i9 = bitcast i32 %i.i7 to float
+ %j.f.i10 = bitcast i32 %j.i8 to float
+ %p1.i11 = call float @llvm.amdgcn.interp.p1(float %i.f.i9, i32 2, i32 7, i32 %arg4) #0
+ %p2.i12 = call float @llvm.amdgcn.interp.p2(float %p1.i11, float %j.f.i10, i32 2, i32 7, i32 %arg4) #0
+ %i.i1 = extractelement <2 x i32> %arg6, i32 0
+ %j.i2 = extractelement <2 x i32> %arg6, i32 1
+ %i.f.i3 = bitcast i32 %i.i1 to float
+ %j.f.i4 = bitcast i32 %j.i2 to float
+ %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 3, i32 7, i32 %arg4) #0
+ %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 3, i32 7, i32 %arg4) #0
%tmp195 = fmul float %arg14, %tmp123
%tmp196 = fadd float %tmp195, %tmp124
- %tmp197 = call float @llvm.AMDGPU.clamp.f32(float %tmp162, float 0.000000e+00, float 1.000000e+00)
- %tmp198 = call float @llvm.AMDGPU.clamp.f32(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
- %tmp199 = call float @llvm.AMDGPU.clamp.f32(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
- %tmp200 = call float @llvm.AMDGPU.clamp.f32(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
- %tmp201 = bitcast float %tmp197 to i32
+ %max.0.i = call float @llvm.maxnum.f32(float %tmp162, float 0.000000e+00)
+ %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00)
+ %tmp201 = bitcast float %clamp.i to i32
%tmp202 = icmp ne i32 %tmp201, 0
%. = select i1 %tmp202, float -1.000000e+00, float 1.000000e+00
- %tmp203 = fsub float -0.000000e+00, %tmp163
+ %tmp203 = fsub float -0.000000e+00, %p2.i
%tmp204 = fadd float %tmp43, %tmp203
- %tmp205 = fsub float -0.000000e+00, %tmp164
+ %tmp205 = fsub float -0.000000e+00, %p2.i186
%tmp206 = fadd float %tmp44, %tmp205
- %tmp207 = fsub float -0.000000e+00, %tmp165
+ %tmp207 = fsub float -0.000000e+00, %p2.i180
%tmp208 = fadd float %tmp45, %tmp207
%tmp209 = fmul float %tmp204, %tmp204
%tmp210 = fmul float %tmp206, %tmp206
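The llvm.AMDGPU.clamp.f32 calls in the hunk above disappear the same way everywhere in this patch: clamp(x, 0.0, 1.0) is open-coded as a maxnum/minnum pair. A sketch (%x is a placeholder):

  %lo = call float @llvm.maxnum.f32(float %x, float 0.000000e+00)        ; clamp from below at 0.0
  %clamped = call float @llvm.minnum.f32(float %lo, float 1.000000e+00)  ; clamp from above at 1.0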
@@ -876,12 +1140,13 @@ main_body:
%tmp218 = fmul float %., %tmp53
%tmp219 = fmul float %arg13, %tmp46
%tmp220 = fmul float %tmp196, %tmp47
- %tmp221 = bitcast float %tmp173 to i32
- %tmp222 = bitcast float %tmp174 to i32
+ %tmp221 = bitcast float %p2.i132 to i32
+ %tmp222 = bitcast float %p2.i126 to i32
%tmp223 = insertelement <2 x i32> undef, i32 %tmp221, i32 0
%tmp224 = insertelement <2 x i32> %tmp223, i32 %tmp222, i32 1
%tmp132.bc = bitcast <16 x i8> %tmp132 to <4 x i32>
- %tmp225 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp224, <8 x i32> %tmp130, <4 x i32> %tmp132.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp224.bc = bitcast <2 x i32> %tmp224 to <2 x float>
+ %tmp225 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp224.bc, <8 x i32> %tmp130, <4 x i32> %tmp132.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp226 = extractelement <4 x float> %tmp225, i32 0
%tmp227 = extractelement <4 x float> %tmp225, i32 1
%tmp228 = extractelement <4 x float> %tmp225, i32 2
@@ -895,34 +1160,36 @@ main_body:
%result.i44 = fadd float %tmp231, %one.sub.a.i43
%one.sub.a.i41 = fsub float 1.000000e+00, %tmp26
%result.i42 = fadd float %tmp232, %one.sub.a.i41
- %tmp233 = fmul float %tmp215, %tmp183
- %tmp234 = fmul float %tmp216, %tmp184
+ %tmp233 = fmul float %tmp215, %p2.i72
+ %tmp234 = fmul float %tmp216, %p2.i66
%tmp235 = fadd float %tmp234, %tmp233
- %tmp236 = fmul float %tmp217, %tmp185
+ %tmp236 = fmul float %tmp217, %p2.i60
%tmp237 = fadd float %tmp235, %tmp236
- %tmp238 = fmul float %tmp215, %tmp186
- %tmp239 = fmul float %tmp216, %tmp187
+ %tmp238 = fmul float %tmp215, %p2.i54
+ %tmp239 = fmul float %tmp216, %p2.i48
%tmp240 = fadd float %tmp239, %tmp238
- %tmp241 = fmul float %tmp217, %tmp188
+ %tmp241 = fmul float %tmp217, %p2.i42
%tmp242 = fadd float %tmp240, %tmp241
- %tmp243 = fmul float %tmp215, %tmp189
- %tmp244 = fmul float %tmp216, %tmp190
+ %tmp243 = fmul float %tmp215, %p2.i36
+ %tmp244 = fmul float %tmp216, %p2.i30
%tmp245 = fadd float %tmp244, %tmp243
- %tmp246 = fmul float %tmp217, %tmp191
+ %tmp246 = fmul float %tmp217, %p2.i24
%tmp247 = fadd float %tmp245, %tmp246
- %tmp248 = call float @llvm.AMDGPU.clamp.f32(float %tmp247, float 0.000000e+00, float 1.000000e+00)
+ %max.0.i19 = call float @llvm.maxnum.f32(float %tmp247, float 0.000000e+00)
+ %clamp.i20 = call float @llvm.minnum.f32(float %max.0.i19, float 1.000000e+00)
%tmp249 = fmul float %tmp213, 0x3F5A36E2E0000000
- %tmp250 = call float @llvm.AMDGPU.clamp.f32(float %tmp249, float 0.000000e+00, float 1.000000e+00)
- %tmp251 = fsub float -0.000000e+00, %tmp250
+ %max.0.i17 = call float @llvm.maxnum.f32(float %tmp249, float 0.000000e+00)
+ %clamp.i18 = call float @llvm.minnum.f32(float %max.0.i17, float 1.000000e+00)
+ %tmp251 = fsub float -0.000000e+00, %clamp.i18
%tmp252 = fadd float 1.000000e+00, %tmp251
- %tmp253 = call float @llvm.pow.f32(float %tmp248, float 2.500000e-01)
+ %tmp253 = call float @llvm.pow.f32(float %clamp.i20, float 2.500000e-01)
%tmp254 = fmul float %tmp38, %tmp253
%tmp255 = fmul float %tmp237, %tmp254
%tmp256 = fmul float %tmp242, %tmp254
%tmp257 = fmul float %tmp255, %tmp229
%tmp258 = fmul float %tmp256, %tmp229
- %tmp259 = fadd float %tmp248, 0x3EE4F8B580000000
- %tmp260 = fsub float -0.000000e+00, %tmp248
+ %tmp259 = fadd float %clamp.i20, 0x3EE4F8B580000000
+ %tmp260 = fsub float -0.000000e+00, %clamp.i20
%tmp261 = fadd float 1.000000e+00, %tmp260
%tmp262 = fmul float 1.200000e+01, %tmp261
%tmp263 = fadd float %tmp262, 4.000000e+00
@@ -942,8 +1209,8 @@ main_body:
LOOP: ; preds = %LOOP, %main_body
%temp144.0 = phi float [ 1.000000e+00, %main_body ], [ %tmp288, %LOOP ]
- %temp168.0 = phi float [ %tmp175, %main_body ], [ %tmp284, %LOOP ]
- %temp169.0 = phi float [ %tmp176, %main_body ], [ %tmp285, %LOOP ]
+ %temp168.0 = phi float [ %p2.i120, %main_body ], [ %tmp284, %LOOP ]
+ %temp169.0 = phi float [ %p2.i114, %main_body ], [ %tmp285, %LOOP ]
%temp170.0 = phi float [ %tmp252, %main_body ], [ %tmp286, %LOOP ]
%tmp276 = bitcast float %temp168.0 to i32
%tmp277 = bitcast float %temp169.0 to i32
@@ -952,7 +1219,8 @@ LOOP: ; preds = %LOOP, %main_body
%tmp280 = insertelement <4 x i32> %tmp279, i32 0, i32 2
%tmp281 = insertelement <4 x i32> %tmp280, i32 undef, i32 3
%tmp148.bc = bitcast <16 x i8> %tmp148 to <4 x i32>
- %tmp282 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp281, <8 x i32> %tmp146, <4 x i32> %tmp148.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp281.bc = bitcast <4 x i32> %tmp281 to <4 x float>
+ %tmp282 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp281.bc, <8 x i32> %tmp146, <4 x i32> %tmp148.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp283 = extractelement <4 x float> %tmp282, i32 3
%tmp284 = fadd float %temp168.0, %tmp273
%tmp285 = fadd float %temp169.0, %tmp274
@@ -979,12 +1247,12 @@ IF189: ; preds = %LOOP
%tmp303 = fadd float %tmp302, %tmp284
%tmp304 = fmul float %tmp301, %tmp274
%tmp305 = fadd float %tmp304, %tmp285
- %tmp306 = fsub float -0.000000e+00, %tmp175
+ %tmp306 = fsub float -0.000000e+00, %p2.i120
%tmp307 = fadd float %tmp303, %tmp306
- %tmp308 = fsub float -0.000000e+00, %tmp176
+ %tmp308 = fsub float -0.000000e+00, %p2.i114
%tmp309 = fadd float %tmp305, %tmp308
- %tmp310 = fadd float %tmp175, %tmp307
- %tmp311 = fadd float %tmp176, %tmp309
+ %tmp310 = fadd float %p2.i120, %tmp307
+ %tmp311 = fadd float %p2.i114, %tmp309
%tmp312 = fmul float %tmp307, %tmp66
%tmp313 = fmul float %tmp309, %tmp67
%tmp314 = fmul float %tmp312, %tmp54
@@ -993,8 +1261,8 @@ IF189: ; preds = %LOOP
%tmp317 = fadd float %tmp316, %tmp314
%tmp318 = fmul float %tmp313, %tmp57
%tmp319 = fadd float %tmp318, %tmp315
- %tmp320 = fadd float %tmp177, %tmp317
- %tmp321 = fadd float %tmp178, %tmp319
+ %tmp320 = fadd float %p2.i108, %tmp317
+ %tmp321 = fadd float %p2.i102, %tmp319
%tmp322 = fmul float %tmp312, %tmp58
%tmp323 = fmul float %tmp312, %tmp59
%tmp324 = fmul float %tmp312, %tmp60
@@ -1007,28 +1275,29 @@ IF189: ; preds = %LOOP
%tmp331 = fadd float %tmp330, %tmp324
%tmp332 = fmul float %tmp313, %tmp65
%tmp333 = fadd float %tmp332, %tmp325
- %tmp334 = fadd float %tmp167, %tmp327
- %tmp335 = fadd float %tmp168, %tmp329
- %tmp336 = fadd float %tmp169, %tmp331
- %tmp337 = fadd float %tmp170, %tmp333
+ %tmp334 = fadd float %p2.i168, %tmp327
+ %tmp335 = fadd float %p2.i162, %tmp329
+ %tmp336 = fadd float %p2.i156, %tmp331
+ %tmp337 = fadd float %p2.i150, %tmp333
%tmp338 = bitcast float %tmp334 to i32
%tmp339 = bitcast float %tmp335 to i32
%tmp340 = insertelement <2 x i32> undef, i32 %tmp338, i32 0
%tmp341 = insertelement <2 x i32> %tmp340, i32 %tmp339, i32 1
%tmp136.bc = bitcast <16 x i8> %tmp136 to <4 x i32>
- %tmp342 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp341, <8 x i32> %tmp134, <4 x i32> %tmp136.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %tmp343 = extractelement <4 x float> %tmp342, i32 0
- %tmp344 = extractelement <4 x float> %tmp342, i32 1
- %tmp345 = extractelement <4 x float> %tmp342, i32 2
- %tmp346 = extractelement <4 x float> %tmp342, i32 3
+ %a.bc.i = bitcast <2 x i32> %tmp341 to <2 x float>
+ %tmp0 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp134, <4 x i32> %tmp136.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %tmp343 = extractelement <4 x float> %tmp0, i32 0
+ %tmp344 = extractelement <4 x float> %tmp0, i32 1
+ %tmp345 = extractelement <4 x float> %tmp0, i32 2
+ %tmp346 = extractelement <4 x float> %tmp0, i32 3
%tmp347 = fmul float %tmp343, %tmp22
%tmp348 = fmul float %tmp344, %tmp23
%tmp349 = fmul float %tmp345, %tmp24
%tmp350 = fmul float %tmp346, %tmp25
- %tmp351 = fmul float %tmp347, %tmp179
- %tmp352 = fmul float %tmp348, %tmp180
- %tmp353 = fmul float %tmp349, %tmp181
- %tmp354 = fmul float %tmp350, %tmp182
+ %tmp351 = fmul float %tmp347, %p2.i96
+ %tmp352 = fmul float %tmp348, %p2.i90
+ %tmp353 = fmul float %tmp349, %p2.i84
+ %tmp354 = fmul float %tmp350, %p2.i78
%tmp355 = fsub float -0.000000e+00, %tmp346
%tmp356 = fadd float 1.000000e+00, %tmp355
%tmp357 = fmul float %tmp356, %tmp48
@@ -1049,8 +1318,9 @@ IF189: ; preds = %LOOP
%tmp360 = insertelement <2 x i32> undef, i32 %tmp358, i32 0
%tmp361 = insertelement <2 x i32> %tmp360, i32 %tmp359, i32 1
%tmp152.bc = bitcast <16 x i8> %tmp152 to <4 x i32>
- %tmp362 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp361, <8 x i32> %tmp150, <4 x i32> %tmp152.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %tmp363 = extractelement <4 x float> %tmp362, i32 2
+ %a.bc.i3 = bitcast <2 x i32> %tmp361 to <2 x float>
+ %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i3, <8 x i32> %tmp150, <4 x i32> %tmp152.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %tmp363 = extractelement <4 x float> %tmp1, i32 2
%tmp364 = fmul float %result.i40, %result.i
%tmp365 = fmul float %result.i36, %result.i44
%tmp366 = fmul float %result.i32, %result.i42
@@ -1060,11 +1330,12 @@ IF189: ; preds = %LOOP
%tmp370 = insertelement <2 x i32> undef, i32 %tmp368, i32 0
%tmp371 = insertelement <2 x i32> %tmp370, i32 %tmp369, i32 1
%tmp140.bc = bitcast <16 x i8> %tmp140 to <4 x i32>
- %tmp372 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp371, <8 x i32> %tmp138, <4 x i32> %tmp140.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %tmp373 = extractelement <4 x float> %tmp372, i32 0
- %tmp374 = extractelement <4 x float> %tmp372, i32 1
- %tmp375 = extractelement <4 x float> %tmp372, i32 2
- %tmp376 = extractelement <4 x float> %tmp372, i32 3
+ %a.bc.i2 = bitcast <2 x i32> %tmp371 to <2 x float>
+ %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i2, <8 x i32> %tmp138, <4 x i32> %tmp140.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %tmp373 = extractelement <4 x float> %tmp2, i32 0
+ %tmp374 = extractelement <4 x float> %tmp2, i32 1
+ %tmp375 = extractelement <4 x float> %tmp2, i32 2
+ %tmp376 = extractelement <4 x float> %tmp2, i32 3
%tmp377 = fcmp olt float 0.000000e+00, %tmp375
%tmp378 = sext i1 %tmp377 to i32
%tmp379 = bitcast i32 %tmp378 to float
@@ -1077,11 +1348,12 @@ IF189: ; preds = %LOOP
%tmp384 = insertelement <2 x i32> undef, i32 %tmp382, i32 0
%tmp385 = insertelement <2 x i32> %tmp384, i32 %tmp383, i32 1
%tmp144.bc = bitcast <16 x i8> %tmp144 to <4 x i32>
- %tmp386 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp385, <8 x i32> %tmp142, <4 x i32> %tmp144.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %tmp387 = extractelement <4 x float> %tmp386, i32 0
- %tmp388 = extractelement <4 x float> %tmp386, i32 1
- %tmp389 = extractelement <4 x float> %tmp386, i32 2
- %tmp390 = extractelement <4 x float> %tmp386, i32 3
+ %a.bc.i1 = bitcast <2 x i32> %tmp385 to <2 x float>
+ %tmp3 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i1, <8 x i32> %tmp142, <4 x i32> %tmp144.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %tmp387 = extractelement <4 x float> %tmp3, i32 0
+ %tmp388 = extractelement <4 x float> %tmp3, i32 1
+ %tmp389 = extractelement <4 x float> %tmp3, i32 2
+ %tmp390 = extractelement <4 x float> %tmp3, i32 3
%tmp391 = fcmp olt float 0.000000e+00, %tmp389
%tmp392 = sext i1 %tmp391 to i32
%tmp393 = bitcast i32 %tmp392 to float
@@ -1107,8 +1379,8 @@ IF189: ; preds = %LOOP
%tmp411 = fmul float %tmp410, %tmp35
%tmp412 = fmul float %tmp409, %tmp363
%tmp413 = fmul float %tmp411, %tmp363
- %tmp414 = call float @fabs(float %tmp405)
- %tmp415 = call float @fabs(float %tmp407)
+ %tmp414 = call float @llvm.fabs.f32(float %tmp405)
+ %tmp415 = call float @llvm.fabs.f32(float %tmp407)
%tmp416 = fsub float -0.000000e+00, %tmp414
%tmp417 = fadd float 1.000000e+00, %tmp416
%tmp418 = fsub float -0.000000e+00, %tmp415
@@ -1122,26 +1394,27 @@ IF189: ; preds = %LOOP
%tmp426 = fadd float %tmp424, %tmp425
%tmp427 = fsub float -0.000000e+00, %tmp426
%tmp428 = fadd float 0x3FF00068E0000000, %tmp427
- %tmp429 = call float @llvm.AMDGPU.clamp.f32(float %tmp428, float 0.000000e+00, float 1.000000e+00)
- %tmp430 = call float @llvm.amdgcn.rsq.f32(float %tmp429)
- %tmp431 = fmul float %tmp430, %tmp429
- %tmp432 = fsub float -0.000000e+00, %tmp429
+ %max.0.i15 = call float @llvm.maxnum.f32(float %tmp428, float 0.000000e+00)
+ %clamp.i16 = call float @llvm.minnum.f32(float %max.0.i15, float 1.000000e+00)
+ %tmp430 = call float @llvm.amdgcn.rsq.f32(float %clamp.i16)
+ %tmp431 = fmul float %tmp430, %clamp.i16
+ %tmp432 = fsub float -0.000000e+00, %clamp.i16
%cmp = fcmp ogt float 0.000000e+00, %tmp432
%tmp433 = select i1 %cmp, float %tmp431, float 0.000000e+00
- %tmp434 = fmul float %tmp183, %tmp421
- %tmp435 = fmul float %tmp184, %tmp421
- %tmp436 = fmul float %tmp185, %tmp421
- %tmp437 = fmul float %tmp186, %tmp423
+ %tmp434 = fmul float %p2.i72, %tmp421
+ %tmp435 = fmul float %p2.i66, %tmp421
+ %tmp436 = fmul float %p2.i60, %tmp421
+ %tmp437 = fmul float %p2.i54, %tmp423
%tmp438 = fadd float %tmp437, %tmp434
- %tmp439 = fmul float %tmp187, %tmp423
+ %tmp439 = fmul float %p2.i48, %tmp423
%tmp440 = fadd float %tmp439, %tmp435
- %tmp441 = fmul float %tmp188, %tmp423
+ %tmp441 = fmul float %p2.i42, %tmp423
%tmp442 = fadd float %tmp441, %tmp436
- %tmp443 = fmul float %tmp189, %tmp433
+ %tmp443 = fmul float %p2.i36, %tmp433
%tmp444 = fadd float %tmp443, %tmp438
- %tmp445 = fmul float %tmp190, %tmp433
+ %tmp445 = fmul float %p2.i30, %tmp433
%tmp446 = fadd float %tmp445, %tmp440
- %tmp447 = fmul float %tmp191, %tmp433
+ %tmp447 = fmul float %p2.i24, %tmp433
%tmp448 = fadd float %tmp447, %tmp442
%tmp449 = fmul float %tmp444, %tmp444
%tmp450 = fmul float %tmp446, %tmp446
@@ -1174,7 +1447,8 @@ ENDIF197: ; preds = %IF198, %IF189
%tmp468 = insertelement <2 x i32> undef, i32 %tmp466, i32 0
%tmp469 = insertelement <2 x i32> %tmp468, i32 %tmp467, i32 1
%tmp160.bc = bitcast <16 x i8> %tmp160 to <4 x i32>
- %tmp470 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp469, <8 x i32> %tmp158, <4 x i32> %tmp160.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp469.bc = bitcast <2 x i32> %tmp469 to <2 x float>
+ %tmp470 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp469.bc, <8 x i32> %tmp158, <4 x i32> %tmp160.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp471 = extractelement <4 x float> %tmp470, i32 0
%tmp472 = extractelement <4 x float> %tmp470, i32 1
%tmp473 = extractelement <4 x float> %tmp470, i32 2
@@ -1187,12 +1461,13 @@ ENDIF197: ; preds = %IF198, %IF189
%tmp480 = fadd float %tmp479, %tmp40
%tmp481 = fmul float %tmp474, %tmp41
%tmp482 = fadd float %tmp481, %tmp42
- %tmp483 = bitcast float %tmp171 to i32
- %tmp484 = bitcast float %tmp172 to i32
+ %tmp483 = bitcast float %p2.i144 to i32
+ %tmp484 = bitcast float %p2.i138 to i32
%tmp485 = insertelement <2 x i32> undef, i32 %tmp483, i32 0
%tmp486 = insertelement <2 x i32> %tmp485, i32 %tmp484, i32 1
%tmp156.bc = bitcast <16 x i8> %tmp156 to <4 x i32>
- %tmp487 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp486, <8 x i32> %tmp154, <4 x i32> %tmp156.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp486.bc = bitcast <2 x i32> %tmp486 to <2 x float>
+ %tmp487 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp486.bc, <8 x i32> %tmp154, <4 x i32> %tmp156.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp488 = extractelement <4 x float> %tmp487, i32 0
%tmp489 = extractelement <4 x float> %tmp487, i32 1
%tmp490 = extractelement <4 x float> %tmp487, i32 2
@@ -1204,11 +1479,11 @@ ENDIF197: ; preds = %IF198, %IF189
%tmp496 = fmul float %tmp489, %tmp494
%tmp497 = fmul float %tmp490, %tmp494
%tmp498 = fmul float %tmp27, %tmp495
- %tmp499 = fadd float %tmp498, %tmp192
+ %tmp499 = fadd float %tmp498, %p2.i18
%tmp500 = fmul float %tmp28, %tmp496
- %tmp501 = fadd float %tmp500, %tmp193
+ %tmp501 = fadd float %tmp500, %p2.i12
%tmp502 = fmul float %tmp29, %tmp497
- %tmp503 = fadd float %tmp502, %tmp194
+ %tmp503 = fadd float %tmp502, %p2.i6
%tmp504 = fmul float %tmp499, %tmp482
%tmp505 = fmul float %tmp501, %tmp482
%tmp506 = fmul float %tmp503, %tmp482
@@ -1242,18 +1517,19 @@ ENDIF197: ; preds = %IF198, %IF189
%tmp534 = fadd float %tmp533, %tmp532
%tmp535 = fmul float %temp14.0, %tmp531
%tmp536 = fadd float %tmp534, %tmp535
- %tmp537 = call float @llvm.AMDGPU.clamp.f32(float %tmp536, float 0.000000e+00, float 1.000000e+00)
- %tmp538 = fmul float %tmp364, %tmp537
- %tmp539 = fmul float %tmp365, %tmp537
- %tmp540 = fmul float %tmp366, %tmp537
+ %max.0.i13 = call float @llvm.maxnum.f32(float %tmp536, float 0.000000e+00)
+ %clamp.i14 = call float @llvm.minnum.f32(float %max.0.i13, float 1.000000e+00)
+ %tmp538 = fmul float %tmp364, %clamp.i14
+ %tmp539 = fmul float %tmp365, %clamp.i14
+ %tmp540 = fmul float %tmp366, %clamp.i14
%tmp541 = fmul float %tmp538, %tmp68
%tmp542 = fmul float %tmp539, %tmp69
%tmp543 = fmul float %tmp540, %tmp70
- %tmp544 = fsub float -0.000000e+00, %tmp163
+ %tmp544 = fsub float -0.000000e+00, %p2.i
%tmp545 = fadd float %tmp96, %tmp544
- %tmp546 = fsub float -0.000000e+00, %tmp164
+ %tmp546 = fsub float -0.000000e+00, %p2.i186
%tmp547 = fadd float %tmp97, %tmp546
- %tmp548 = fsub float -0.000000e+00, %tmp165
+ %tmp548 = fsub float -0.000000e+00, %p2.i180
%tmp549 = fadd float %tmp98, %tmp548
%tmp550 = fmul float %tmp545, %tmp545
%tmp551 = fmul float %tmp547, %tmp547
@@ -1339,31 +1615,31 @@ ENDIF209: ; preds = %ELSE214, %ELSE211,
%temp69.0 = phi float [ %tmp112, %ENDIF200 ], [ %.231, %ELSE214 ], [ %tmp108, %ELSE211 ]
%temp70.0 = phi float [ %tmp113, %ENDIF200 ], [ %.232, %ELSE214 ], [ %tmp109, %ELSE211 ]
%temp71.0 = phi float [ %tmp114, %ENDIF200 ], [ %.233, %ELSE214 ], [ %tmp110, %ELSE211 ]
- %tmp602 = fmul float %tmp163, %tmp84
- %tmp603 = fmul float %tmp164, %tmp85
+ %tmp602 = fmul float %p2.i, %tmp84
+ %tmp603 = fmul float %p2.i186, %tmp85
%tmp604 = fadd float %tmp602, %tmp603
- %tmp605 = fmul float %tmp165, %tmp86
+ %tmp605 = fmul float %p2.i180, %tmp86
%tmp606 = fadd float %tmp604, %tmp605
- %tmp607 = fmul float %tmp166, %tmp87
+ %tmp607 = fmul float %p2.i174, %tmp87
%tmp608 = fadd float %tmp606, %tmp607
- %tmp609 = fmul float %tmp163, %tmp88
- %tmp610 = fmul float %tmp164, %tmp89
+ %tmp609 = fmul float %p2.i, %tmp88
+ %tmp610 = fmul float %p2.i186, %tmp89
%tmp611 = fadd float %tmp609, %tmp610
- %tmp612 = fmul float %tmp165, %tmp90
+ %tmp612 = fmul float %p2.i180, %tmp90
%tmp613 = fadd float %tmp611, %tmp612
- %tmp614 = fmul float %tmp166, %tmp91
+ %tmp614 = fmul float %p2.i174, %tmp91
%tmp615 = fadd float %tmp613, %tmp614
- %tmp616 = fmul float %tmp163, %tmp92
- %tmp617 = fmul float %tmp164, %tmp93
+ %tmp616 = fmul float %p2.i, %tmp92
+ %tmp617 = fmul float %p2.i186, %tmp93
%tmp618 = fadd float %tmp616, %tmp617
- %tmp619 = fmul float %tmp165, %tmp94
+ %tmp619 = fmul float %p2.i180, %tmp94
%tmp620 = fadd float %tmp618, %tmp619
- %tmp621 = fmul float %tmp166, %tmp95
+ %tmp621 = fmul float %p2.i174, %tmp95
%tmp622 = fadd float %tmp620, %tmp621
%tmp623 = fsub float -0.000000e+00, %tmp77
%tmp624 = fadd float 1.000000e+00, %tmp623
- %tmp625 = call float @fabs(float %tmp608)
- %tmp626 = call float @fabs(float %tmp615)
+ %tmp625 = call float @llvm.fabs.f32(float %tmp608)
+ %tmp626 = call float @llvm.fabs.f32(float %tmp615)
%tmp627 = fcmp oge float %tmp624, %tmp625
%tmp628 = sext i1 %tmp627 to i32
%tmp629 = bitcast i32 %tmp628 to float
@@ -1389,7 +1665,8 @@ ENDIF209: ; preds = %ELSE214, %ELSE211,
%tmp649 = fadd float %temp80.0, -1.000000e+00
%tmp650 = fmul float %tmp649, %tmp76
%tmp651 = fadd float %tmp650, 1.000000e+00
- %tmp652 = call float @llvm.AMDGPU.clamp.f32(float %tmp651, float 0.000000e+00, float 1.000000e+00)
+ %max.0.i11 = call float @llvm.maxnum.f32(float %tmp651, float 0.000000e+00)
+ %clamp.i12 = call float @llvm.minnum.f32(float %max.0.i11, float 1.000000e+00)
%tmp653 = bitcast float %tmp642 to i32
%tmp654 = bitcast float %tmp644 to i32
%tmp655 = bitcast float 0.000000e+00 to i32
@@ -1398,7 +1675,8 @@ ENDIF209: ; preds = %ELSE214, %ELSE211,
%tmp658 = insertelement <4 x i32> %tmp657, i32 %tmp655, i32 2
%tmp659 = insertelement <4 x i32> %tmp658, i32 undef, i32 3
%tmp128.bc = bitcast <16 x i8> %tmp128 to <4 x i32>
- %tmp660 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp659, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp659.bc = bitcast <4 x i32> %tmp659 to <4 x float>
+ %tmp660 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp659.bc, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp661 = extractelement <4 x float> %tmp660, i32 0
%tmp662 = extractelement <4 x float> %tmp660, i32 1
%tmp663 = bitcast float %tmp646 to i32
@@ -1408,7 +1686,8 @@ ENDIF209: ; preds = %ELSE214, %ELSE211,
%tmp667 = insertelement <4 x i32> %tmp666, i32 %tmp664, i32 1
%tmp668 = insertelement <4 x i32> %tmp667, i32 %tmp665, i32 2
%tmp669 = insertelement <4 x i32> %tmp668, i32 undef, i32 3
- %tmp670 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp669, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp669.bc = bitcast <4 x i32> %tmp669 to <4 x float>
+ %tmp670 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp669.bc, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp671 = extractelement <4 x float> %tmp670, i32 0
%tmp672 = extractelement <4 x float> %tmp670, i32 1
%tmp673 = fsub float -0.000000e+00, %tmp662
@@ -1425,11 +1704,13 @@ ENDIF209: ; preds = %ELSE214, %ELSE211,
%tmp684 = fadd float %tmp683, %temp89.0
%tmp685 = fmul float %tmp640, %temp90.0
%tmp686 = fadd float %tmp685, %temp91.0
- %tmp687 = call float @llvm.AMDGPU.clamp.f32(float %tmp684, float 0.000000e+00, float 1.000000e+00)
- %tmp688 = call float @llvm.AMDGPU.clamp.f32(float %tmp686, float 0.000000e+00, float 1.000000e+00)
- %tmp689 = fsub float -0.000000e+00, %tmp687
+ %max.0.i9 = call float @llvm.maxnum.f32(float %tmp684, float 0.000000e+00)
+ %clamp.i10 = call float @llvm.minnum.f32(float %max.0.i9, float 1.000000e+00)
+ %max.0.i7 = call float @llvm.maxnum.f32(float %tmp686, float 0.000000e+00)
+ %clamp.i8 = call float @llvm.minnum.f32(float %max.0.i7, float 1.000000e+00)
+ %tmp689 = fsub float -0.000000e+00, %clamp.i10
%tmp690 = fadd float %tmp661, %tmp689
- %tmp691 = fsub float -0.000000e+00, %tmp688
+ %tmp691 = fsub float -0.000000e+00, %clamp.i8
%tmp692 = fadd float %tmp671, %tmp691
%tmp693 = fmul float %tmp661, %tmp661
%tmp694 = fmul float %tmp671, %tmp671
@@ -1461,16 +1742,17 @@ ENDIF209: ; preds = %ELSE214, %ELSE211,
%tmp719 = bitcast float %tmp718 to i32
%tmp720 = icmp ne i32 %tmp719, 0
%temp28.0 = select i1 %tmp720, float 1.000000e+00, float %tmp710
- %one.sub.a.i25 = fsub float 1.000000e+00, %tmp652
+ %one.sub.a.i25 = fsub float 1.000000e+00, %clamp.i12
%one.sub.ac.i26 = fmul float %one.sub.a.i25, %.229
%mul.i27 = fmul float %temp28.0, %.229
%result.i28 = fadd float %mul.i27, %one.sub.ac.i26
%tmp721 = call float @llvm.pow.f32(float %result.i28, float %tmp75)
%tmp722 = fmul float %tmp721, %tmp78
%tmp723 = fadd float %tmp722, %tmp79
- %tmp724 = call float @llvm.AMDGPU.clamp.f32(float %tmp723, float 0.000000e+00, float 1.000000e+00)
- %tmp725 = fmul float %tmp724, %tmp724
- %tmp726 = fmul float 2.000000e+00, %tmp724
+ %max.0.i5 = call float @llvm.maxnum.f32(float %tmp723, float 0.000000e+00)
+ %clamp.i6 = call float @llvm.minnum.f32(float %max.0.i5, float 1.000000e+00)
+ %tmp725 = fmul float %clamp.i6, %clamp.i6
+ %tmp726 = fmul float 2.000000e+00, %clamp.i6
%tmp727 = fsub float -0.000000e+00, %tmp726
%tmp728 = fadd float 3.000000e+00, %tmp727
%tmp729 = fmul float %tmp725, %tmp728
@@ -1504,12 +1786,13 @@ ENDIF209: ; preds = %ELSE214, %ELSE211,
%tmp747 = fadd float %tmp746, %tmp745
%tmp748 = fmul float %temp14.0, %tmp217
%tmp749 = fadd float %tmp747, %tmp748
- %tmp750 = call float @fabs(float %tmp749)
+ %tmp750 = call float @llvm.fabs.f32(float %tmp749)
%tmp751 = fmul float %tmp750, %tmp750
%tmp752 = fmul float %tmp751, %tmp49
%tmp753 = fadd float %tmp752, %tmp50
- %tmp754 = call float @llvm.AMDGPU.clamp.f32(float %tmp753, float 0.000000e+00, float 1.000000e+00)
- %tmp755 = fsub float -0.000000e+00, %tmp754
+ %max.0.i3 = call float @llvm.maxnum.f32(float %tmp753, float 0.000000e+00)
+ %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00)
+ %tmp755 = fsub float -0.000000e+00, %clamp.i4
%tmp756 = fadd float 1.000000e+00, %tmp755
%tmp757 = fmul float %tmp32, %tmp756
%tmp758 = fmul float %tmp32, %tmp756
@@ -1545,12 +1828,11 @@ ENDIF209: ; preds = %ELSE214, %ELSE211,
%tmp772 = select i1 %tmp771, float 6.550400e+04, float %tmp766
%tmp773 = fmul float %result.i2, %tmp51
%tmp774 = fadd float %tmp773, %tmp52
- %tmp775 = call float @llvm.AMDGPU.clamp.f32(float %tmp774, float 0.000000e+00, float 1.000000e+00)
- %tmp776 = call i32 @llvm.SI.packf16(float %tmp768, float %tmp770)
- %tmp777 = bitcast i32 %tmp776 to float
- %tmp778 = call i32 @llvm.SI.packf16(float %tmp772, float %tmp775)
- %tmp779 = bitcast i32 %tmp778 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp777, float %tmp779, float %tmp777, float %tmp779)
+ %max.0.i1 = call float @llvm.maxnum.f32(float %tmp774, float 0.000000e+00)
+ %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00)
+ %tmp776 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp768, float %tmp770)
+ %tmp778 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp772, float %clamp.i2)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp776, <2 x half> %tmp778, i1 true, i1 true) #0
ret void
ELSE214: ; preds = %ELSE211
@@ -1566,57 +1848,32 @@ ELSE214: ; preds = %ELSE211
br label %ENDIF209
}

-; Function Attrs: readnone
-declare float @llvm.AMDGPU.clamp.f32(float, float, float) #1
-
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2
-
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2
-
-
-declare float @llvm.exp2.f32(float) #2
-
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #2
-
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #2
-
+declare float @llvm.exp2.f32(float) #1
+declare float @llvm.ceil.f32(float) #1
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.pow.f32(float, float) #1
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare float @llvm.amdgcn.rsq.f32(float) #1
+declare float @llvm.amdgcn.cubeid(float, float, float) #1
+declare float @llvm.amdgcn.cubesc(float, float, float) #1
+declare float @llvm.amdgcn.cubetc(float, float, float) #1
+declare float @llvm.amdgcn.cubema(float, float, float) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
+declare <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
+declare <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-; Function Attrs: nounwind readonly
-declare float @ceil(float) #3
-
-; Function Attrs: nounwind readnone
-declare float @llvm.amdgcn.rsq.f32(float) #2
-
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2
-
-; Function Attrs: readnone
-declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1
-
-; Function Attrs: readnone
-declare float @fabs(float) #1
-
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2
-
-
-; Function Attrs: nounwind readnone
-declare float @llvm.pow.f32(float, float) #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #2
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #1 = { readnone }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readonly }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
!0 = !{!1, !1, i64 0, i32 1}
!1 = !{!"const", !2}
diff --git a/test/CodeGen/AMDGPU/si-spill-cf.ll b/test/CodeGen/AMDGPU/si-spill-cf.ll
index 06f9277080a8..926702645d9e 100644
--- a/test/CodeGen/AMDGPU/si-spill-cf.ll
+++ b/test/CodeGen/AMDGPU/si-spill-cf.ll
@@ -6,270 +6,271 @@
; SI: s_or_b64 exec, exec, [[SAVED:s\[[0-9]+:[0-9]+\]|[a-z]+]]
; SI-NOT: v_readlane_b32 [[SAVED]]
+
define amdgpu_ps void @main() #0 {
main_body:
- %0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 16)
- %1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32)
- %2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 80)
- %3 = call float @llvm.SI.load.const(<16 x i8> undef, i32 84)
- %4 = call float @llvm.SI.load.const(<16 x i8> undef, i32 88)
- %5 = call float @llvm.SI.load.const(<16 x i8> undef, i32 96)
- %6 = call float @llvm.SI.load.const(<16 x i8> undef, i32 100)
- %7 = call float @llvm.SI.load.const(<16 x i8> undef, i32 104)
- %8 = call float @llvm.SI.load.const(<16 x i8> undef, i32 112)
- %9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 116)
- %10 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120)
- %11 = call float @llvm.SI.load.const(<16 x i8> undef, i32 128)
- %12 = call float @llvm.SI.load.const(<16 x i8> undef, i32 132)
- %13 = call float @llvm.SI.load.const(<16 x i8> undef, i32 136)
- %14 = call float @llvm.SI.load.const(<16 x i8> undef, i32 144)
- %15 = call float @llvm.SI.load.const(<16 x i8> undef, i32 148)
- %16 = call float @llvm.SI.load.const(<16 x i8> undef, i32 152)
- %17 = call float @llvm.SI.load.const(<16 x i8> undef, i32 160)
- %18 = call float @llvm.SI.load.const(<16 x i8> undef, i32 164)
- %19 = call float @llvm.SI.load.const(<16 x i8> undef, i32 168)
- %20 = call float @llvm.SI.load.const(<16 x i8> undef, i32 176)
- %21 = call float @llvm.SI.load.const(<16 x i8> undef, i32 180)
- %22 = call float @llvm.SI.load.const(<16 x i8> undef, i32 184)
- %23 = call float @llvm.SI.load.const(<16 x i8> undef, i32 192)
- %24 = call float @llvm.SI.load.const(<16 x i8> undef, i32 196)
- %25 = call float @llvm.SI.load.const(<16 x i8> undef, i32 200)
- %26 = call float @llvm.SI.load.const(<16 x i8> undef, i32 208)
- %27 = call float @llvm.SI.load.const(<16 x i8> undef, i32 212)
- %28 = call float @llvm.SI.load.const(<16 x i8> undef, i32 216)
- %29 = call float @llvm.SI.load.const(<16 x i8> undef, i32 224)
- %30 = call float @llvm.SI.load.const(<16 x i8> undef, i32 228)
- %31 = call float @llvm.SI.load.const(<16 x i8> undef, i32 232)
- %32 = call float @llvm.SI.load.const(<16 x i8> undef, i32 240)
- %33 = call float @llvm.SI.load.const(<16 x i8> undef, i32 244)
- %34 = call float @llvm.SI.load.const(<16 x i8> undef, i32 248)
- %35 = call float @llvm.SI.load.const(<16 x i8> undef, i32 256)
- %36 = call float @llvm.SI.load.const(<16 x i8> undef, i32 260)
- %37 = call float @llvm.SI.load.const(<16 x i8> undef, i32 264)
- %38 = call float @llvm.SI.load.const(<16 x i8> undef, i32 272)
- %39 = call float @llvm.SI.load.const(<16 x i8> undef, i32 276)
- %40 = call float @llvm.SI.load.const(<16 x i8> undef, i32 280)
- %41 = call float @llvm.SI.load.const(<16 x i8> undef, i32 288)
- %42 = call float @llvm.SI.load.const(<16 x i8> undef, i32 292)
- %43 = call float @llvm.SI.load.const(<16 x i8> undef, i32 296)
- %44 = call float @llvm.SI.load.const(<16 x i8> undef, i32 304)
- %45 = call float @llvm.SI.load.const(<16 x i8> undef, i32 308)
- %46 = call float @llvm.SI.load.const(<16 x i8> undef, i32 312)
- %47 = call float @llvm.SI.load.const(<16 x i8> undef, i32 320)
- %48 = call float @llvm.SI.load.const(<16 x i8> undef, i32 324)
- %49 = call float @llvm.SI.load.const(<16 x i8> undef, i32 328)
- %50 = call float @llvm.SI.load.const(<16 x i8> undef, i32 336)
- %51 = call float @llvm.SI.load.const(<16 x i8> undef, i32 340)
- %52 = call float @llvm.SI.load.const(<16 x i8> undef, i32 344)
- %53 = call float @llvm.SI.load.const(<16 x i8> undef, i32 352)
- %54 = call float @llvm.SI.load.const(<16 x i8> undef, i32 356)
- %55 = call float @llvm.SI.load.const(<16 x i8> undef, i32 360)
- %56 = call float @llvm.SI.load.const(<16 x i8> undef, i32 368)
- %57 = call float @llvm.SI.load.const(<16 x i8> undef, i32 372)
- %58 = call float @llvm.SI.load.const(<16 x i8> undef, i32 376)
- %59 = call float @llvm.SI.load.const(<16 x i8> undef, i32 384)
- %60 = call float @llvm.SI.load.const(<16 x i8> undef, i32 388)
- %61 = call float @llvm.SI.load.const(<16 x i8> undef, i32 392)
- %62 = call float @llvm.SI.load.const(<16 x i8> undef, i32 400)
- %63 = call float @llvm.SI.load.const(<16 x i8> undef, i32 404)
- %64 = call float @llvm.SI.load.const(<16 x i8> undef, i32 408)
- %65 = call float @llvm.SI.load.const(<16 x i8> undef, i32 416)
- %66 = call float @llvm.SI.load.const(<16 x i8> undef, i32 420)
+ %tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 16)
+ %tmp1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32)
+ %tmp2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 80)
+ %tmp3 = call float @llvm.SI.load.const(<16 x i8> undef, i32 84)
+ %tmp4 = call float @llvm.SI.load.const(<16 x i8> undef, i32 88)
+ %tmp5 = call float @llvm.SI.load.const(<16 x i8> undef, i32 96)
+ %tmp6 = call float @llvm.SI.load.const(<16 x i8> undef, i32 100)
+ %tmp7 = call float @llvm.SI.load.const(<16 x i8> undef, i32 104)
+ %tmp8 = call float @llvm.SI.load.const(<16 x i8> undef, i32 112)
+ %tmp9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 116)
+ %tmp10 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120)
+ %tmp11 = call float @llvm.SI.load.const(<16 x i8> undef, i32 128)
+ %tmp12 = call float @llvm.SI.load.const(<16 x i8> undef, i32 132)
+ %tmp13 = call float @llvm.SI.load.const(<16 x i8> undef, i32 136)
+ %tmp14 = call float @llvm.SI.load.const(<16 x i8> undef, i32 144)
+ %tmp15 = call float @llvm.SI.load.const(<16 x i8> undef, i32 148)
+ %tmp16 = call float @llvm.SI.load.const(<16 x i8> undef, i32 152)
+ %tmp17 = call float @llvm.SI.load.const(<16 x i8> undef, i32 160)
+ %tmp18 = call float @llvm.SI.load.const(<16 x i8> undef, i32 164)
+ %tmp19 = call float @llvm.SI.load.const(<16 x i8> undef, i32 168)
+ %tmp20 = call float @llvm.SI.load.const(<16 x i8> undef, i32 176)
+ %tmp21 = call float @llvm.SI.load.const(<16 x i8> undef, i32 180)
+ %tmp22 = call float @llvm.SI.load.const(<16 x i8> undef, i32 184)
+ %tmp23 = call float @llvm.SI.load.const(<16 x i8> undef, i32 192)
+ %tmp24 = call float @llvm.SI.load.const(<16 x i8> undef, i32 196)
+ %tmp25 = call float @llvm.SI.load.const(<16 x i8> undef, i32 200)
+ %tmp26 = call float @llvm.SI.load.const(<16 x i8> undef, i32 208)
+ %tmp27 = call float @llvm.SI.load.const(<16 x i8> undef, i32 212)
+ %tmp28 = call float @llvm.SI.load.const(<16 x i8> undef, i32 216)
+ %tmp29 = call float @llvm.SI.load.const(<16 x i8> undef, i32 224)
+ %tmp30 = call float @llvm.SI.load.const(<16 x i8> undef, i32 228)
+ %tmp31 = call float @llvm.SI.load.const(<16 x i8> undef, i32 232)
+ %tmp32 = call float @llvm.SI.load.const(<16 x i8> undef, i32 240)
+ %tmp33 = call float @llvm.SI.load.const(<16 x i8> undef, i32 244)
+ %tmp34 = call float @llvm.SI.load.const(<16 x i8> undef, i32 248)
+ %tmp35 = call float @llvm.SI.load.const(<16 x i8> undef, i32 256)
+ %tmp36 = call float @llvm.SI.load.const(<16 x i8> undef, i32 260)
+ %tmp37 = call float @llvm.SI.load.const(<16 x i8> undef, i32 264)
+ %tmp38 = call float @llvm.SI.load.const(<16 x i8> undef, i32 272)
+ %tmp39 = call float @llvm.SI.load.const(<16 x i8> undef, i32 276)
+ %tmp40 = call float @llvm.SI.load.const(<16 x i8> undef, i32 280)
+ %tmp41 = call float @llvm.SI.load.const(<16 x i8> undef, i32 288)
+ %tmp42 = call float @llvm.SI.load.const(<16 x i8> undef, i32 292)
+ %tmp43 = call float @llvm.SI.load.const(<16 x i8> undef, i32 296)
+ %tmp44 = call float @llvm.SI.load.const(<16 x i8> undef, i32 304)
+ %tmp45 = call float @llvm.SI.load.const(<16 x i8> undef, i32 308)
+ %tmp46 = call float @llvm.SI.load.const(<16 x i8> undef, i32 312)
+ %tmp47 = call float @llvm.SI.load.const(<16 x i8> undef, i32 320)
+ %tmp48 = call float @llvm.SI.load.const(<16 x i8> undef, i32 324)
+ %tmp49 = call float @llvm.SI.load.const(<16 x i8> undef, i32 328)
+ %tmp50 = call float @llvm.SI.load.const(<16 x i8> undef, i32 336)
+ %tmp51 = call float @llvm.SI.load.const(<16 x i8> undef, i32 340)
+ %tmp52 = call float @llvm.SI.load.const(<16 x i8> undef, i32 344)
+ %tmp53 = call float @llvm.SI.load.const(<16 x i8> undef, i32 352)
+ %tmp54 = call float @llvm.SI.load.const(<16 x i8> undef, i32 356)
+ %tmp55 = call float @llvm.SI.load.const(<16 x i8> undef, i32 360)
+ %tmp56 = call float @llvm.SI.load.const(<16 x i8> undef, i32 368)
+ %tmp57 = call float @llvm.SI.load.const(<16 x i8> undef, i32 372)
+ %tmp58 = call float @llvm.SI.load.const(<16 x i8> undef, i32 376)
+ %tmp59 = call float @llvm.SI.load.const(<16 x i8> undef, i32 384)
+ %tmp60 = call float @llvm.SI.load.const(<16 x i8> undef, i32 388)
+ %tmp61 = call float @llvm.SI.load.const(<16 x i8> undef, i32 392)
+ %tmp62 = call float @llvm.SI.load.const(<16 x i8> undef, i32 400)
+ %tmp63 = call float @llvm.SI.load.const(<16 x i8> undef, i32 404)
+ %tmp64 = call float @llvm.SI.load.const(<16 x i8> undef, i32 408)
+ %tmp65 = call float @llvm.SI.load.const(<16 x i8> undef, i32 416)
+ %tmp66 = call float @llvm.SI.load.const(<16 x i8> undef, i32 420)
br label %LOOP
LOOP: ; preds = %ENDIF2795, %main_body
%temp894.0 = phi float [ 0.000000e+00, %main_body ], [ %temp894.1, %ENDIF2795 ]
%temp18.0 = phi float [ undef, %main_body ], [ %temp18.1, %ENDIF2795 ]
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
- %67 = icmp sgt i32 %tid, 4
- br i1 %67, label %ENDLOOP, label %ENDIF
+ %tmp67 = icmp sgt i32 %tid, 4
+ br i1 %tmp67, label %ENDLOOP, label %ENDIF
ENDLOOP: ; preds = %ELSE2566, %LOOP
- %one.sub.a.i = fsub float 1.000000e+00, %0
+ %one.sub.a.i = fsub float 1.000000e+00, %tmp
%one.sub.ac.i = fmul float %one.sub.a.i, undef
%result.i = fadd float fmul (float undef, float undef), %one.sub.ac.i
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float undef, float %result.i, float undef, float 1.000000e+00)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float undef, float %result.i, float undef, float 1.000000e+00, i1 true, i1 true) #0
ret void
ENDIF: ; preds = %LOOP
- %68 = fsub float %2, undef
- %69 = fsub float %3, undef
- %70 = fsub float %4, undef
- %71 = fmul float %68, 0.000000e+00
- %72 = fmul float %69, undef
- %73 = fmul float %70, undef
- %74 = fsub float %6, undef
- %75 = fsub float %7, undef
- %76 = fmul float %74, undef
- %77 = fmul float %75, 0.000000e+00
- %78 = call float @llvm.minnum.f32(float %73, float %77)
- %79 = call float @llvm.maxnum.f32(float %71, float 0.000000e+00)
- %80 = call float @llvm.maxnum.f32(float %72, float %76)
- %81 = call float @llvm.maxnum.f32(float undef, float %78)
- %82 = call float @llvm.minnum.f32(float %79, float %80)
- %83 = call float @llvm.minnum.f32(float %82, float undef)
- %84 = fsub float %14, undef
- %85 = fsub float %15, undef
- %86 = fsub float %16, undef
- %87 = fmul float %84, undef
- %88 = fmul float %85, undef
- %89 = fmul float %86, undef
- %90 = fsub float %17, undef
- %91 = fsub float %18, undef
- %92 = fsub float %19, undef
- %93 = fmul float %90, 0.000000e+00
- %94 = fmul float %91, undef
- %95 = fmul float %92, undef
- %96 = call float @llvm.minnum.f32(float %88, float %94)
- %97 = call float @llvm.maxnum.f32(float %87, float %93)
- %98 = call float @llvm.maxnum.f32(float %89, float %95)
- %99 = call float @llvm.maxnum.f32(float undef, float %96)
- %100 = call float @llvm.maxnum.f32(float %99, float undef)
- %101 = call float @llvm.minnum.f32(float %97, float undef)
- %102 = call float @llvm.minnum.f32(float %101, float %98)
- %103 = fsub float %30, undef
- %104 = fsub float %31, undef
- %105 = fmul float %103, 0.000000e+00
- %106 = fmul float %104, 0.000000e+00
- %107 = call float @llvm.minnum.f32(float undef, float %105)
- %108 = call float @llvm.maxnum.f32(float undef, float %106)
- %109 = call float @llvm.maxnum.f32(float undef, float %107)
- %110 = call float @llvm.maxnum.f32(float %109, float undef)
- %111 = call float @llvm.minnum.f32(float undef, float %108)
- %112 = fsub float %32, undef
- %113 = fsub float %33, undef
- %114 = fsub float %34, undef
- %115 = fmul float %112, 0.000000e+00
- %116 = fmul float %113, undef
- %117 = fmul float %114, undef
- %118 = fsub float %35, undef
- %119 = fsub float %36, undef
- %120 = fsub float %37, undef
- %121 = fmul float %118, undef
- %122 = fmul float %119, undef
- %123 = fmul float %120, undef
- %124 = call float @llvm.minnum.f32(float %115, float %121)
- %125 = call float @llvm.minnum.f32(float %116, float %122)
- %126 = call float @llvm.minnum.f32(float %117, float %123)
- %127 = call float @llvm.maxnum.f32(float %124, float %125)
- %128 = call float @llvm.maxnum.f32(float %127, float %126)
- %129 = fsub float %38, undef
- %130 = fsub float %39, undef
- %131 = fsub float %40, undef
- %132 = fmul float %129, 0.000000e+00
- %133 = fmul float %130, undef
- %134 = fmul float %131, undef
- %135 = fsub float %41, undef
- %136 = fsub float %42, undef
- %137 = fsub float %43, undef
- %138 = fmul float %135, undef
- %139 = fmul float %136, undef
- %140 = fmul float %137, undef
- %141 = call float @llvm.minnum.f32(float %132, float %138)
- %142 = call float @llvm.minnum.f32(float %133, float %139)
- %143 = call float @llvm.minnum.f32(float %134, float %140)
- %144 = call float @llvm.maxnum.f32(float %141, float %142)
- %145 = call float @llvm.maxnum.f32(float %144, float %143)
- %146 = fsub float %44, undef
- %147 = fsub float %45, undef
- %148 = fsub float %46, undef
- %149 = fmul float %146, 0.000000e+00
- %150 = fmul float %147, 0.000000e+00
- %151 = fmul float %148, undef
- %152 = fsub float %47, undef
- %153 = fsub float %48, undef
- %154 = fsub float %49, undef
- %155 = fmul float %152, undef
- %156 = fmul float %153, 0.000000e+00
- %157 = fmul float %154, undef
- %158 = call float @llvm.minnum.f32(float %149, float %155)
- %159 = call float @llvm.minnum.f32(float %150, float %156)
- %160 = call float @llvm.minnum.f32(float %151, float %157)
- %161 = call float @llvm.maxnum.f32(float %158, float %159)
- %162 = call float @llvm.maxnum.f32(float %161, float %160)
- %163 = fsub float %50, undef
- %164 = fsub float %51, undef
- %165 = fsub float %52, undef
- %166 = fmul float %163, undef
- %167 = fmul float %164, 0.000000e+00
- %168 = fmul float %165, 0.000000e+00
- %169 = fsub float %53, undef
- %170 = fsub float %54, undef
- %171 = fsub float %55, undef
- %172 = fdiv float 1.000000e+00, %temp18.0
- %173 = fmul float %169, undef
- %174 = fmul float %170, undef
- %175 = fmul float %171, %172
- %176 = call float @llvm.minnum.f32(float %166, float %173)
- %177 = call float @llvm.minnum.f32(float %167, float %174)
- %178 = call float @llvm.minnum.f32(float %168, float %175)
- %179 = call float @llvm.maxnum.f32(float %176, float %177)
- %180 = call float @llvm.maxnum.f32(float %179, float %178)
- %181 = fsub float %62, undef
- %182 = fsub float %63, undef
- %183 = fsub float %64, undef
- %184 = fmul float %181, 0.000000e+00
- %185 = fmul float %182, undef
- %186 = fmul float %183, undef
- %187 = fsub float %65, undef
- %188 = fsub float %66, undef
- %189 = fmul float %187, undef
- %190 = fmul float %188, undef
- %191 = call float @llvm.maxnum.f32(float %184, float %189)
- %192 = call float @llvm.maxnum.f32(float %185, float %190)
- %193 = call float @llvm.maxnum.f32(float %186, float undef)
- %194 = call float @llvm.minnum.f32(float %191, float %192)
- %195 = call float @llvm.minnum.f32(float %194, float %193)
- %.temp292.7 = select i1 undef, float %162, float undef
- %temp292.9 = select i1 false, float %180, float %.temp292.7
+ %tmp68 = fsub float %tmp2, undef
+ %tmp69 = fsub float %tmp3, undef
+ %tmp70 = fsub float %tmp4, undef
+ %tmp71 = fmul float %tmp68, 0.000000e+00
+ %tmp72 = fmul float %tmp69, undef
+ %tmp73 = fmul float %tmp70, undef
+ %tmp74 = fsub float %tmp6, undef
+ %tmp75 = fsub float %tmp7, undef
+ %tmp76 = fmul float %tmp74, undef
+ %tmp77 = fmul float %tmp75, 0.000000e+00
+ %tmp78 = call float @llvm.minnum.f32(float %tmp73, float %tmp77)
+ %tmp79 = call float @llvm.maxnum.f32(float %tmp71, float 0.000000e+00)
+ %tmp80 = call float @llvm.maxnum.f32(float %tmp72, float %tmp76)
+ %tmp81 = call float @llvm.maxnum.f32(float undef, float %tmp78)
+ %tmp82 = call float @llvm.minnum.f32(float %tmp79, float %tmp80)
+ %tmp83 = call float @llvm.minnum.f32(float %tmp82, float undef)
+ %tmp84 = fsub float %tmp14, undef
+ %tmp85 = fsub float %tmp15, undef
+ %tmp86 = fsub float %tmp16, undef
+ %tmp87 = fmul float %tmp84, undef
+ %tmp88 = fmul float %tmp85, undef
+ %tmp89 = fmul float %tmp86, undef
+ %tmp90 = fsub float %tmp17, undef
+ %tmp91 = fsub float %tmp18, undef
+ %tmp92 = fsub float %tmp19, undef
+ %tmp93 = fmul float %tmp90, 0.000000e+00
+ %tmp94 = fmul float %tmp91, undef
+ %tmp95 = fmul float %tmp92, undef
+ %tmp96 = call float @llvm.minnum.f32(float %tmp88, float %tmp94)
+ %tmp97 = call float @llvm.maxnum.f32(float %tmp87, float %tmp93)
+ %tmp98 = call float @llvm.maxnum.f32(float %tmp89, float %tmp95)
+ %tmp99 = call float @llvm.maxnum.f32(float undef, float %tmp96)
+ %tmp100 = call float @llvm.maxnum.f32(float %tmp99, float undef)
+ %tmp101 = call float @llvm.minnum.f32(float %tmp97, float undef)
+ %tmp102 = call float @llvm.minnum.f32(float %tmp101, float %tmp98)
+ %tmp103 = fsub float %tmp30, undef
+ %tmp104 = fsub float %tmp31, undef
+ %tmp105 = fmul float %tmp103, 0.000000e+00
+ %tmp106 = fmul float %tmp104, 0.000000e+00
+ %tmp107 = call float @llvm.minnum.f32(float undef, float %tmp105)
+ %tmp108 = call float @llvm.maxnum.f32(float undef, float %tmp106)
+ %tmp109 = call float @llvm.maxnum.f32(float undef, float %tmp107)
+ %tmp110 = call float @llvm.maxnum.f32(float %tmp109, float undef)
+ %tmp111 = call float @llvm.minnum.f32(float undef, float %tmp108)
+ %tmp112 = fsub float %tmp32, undef
+ %tmp113 = fsub float %tmp33, undef
+ %tmp114 = fsub float %tmp34, undef
+ %tmp115 = fmul float %tmp112, 0.000000e+00
+ %tmp116 = fmul float %tmp113, undef
+ %tmp117 = fmul float %tmp114, undef
+ %tmp118 = fsub float %tmp35, undef
+ %tmp119 = fsub float %tmp36, undef
+ %tmp120 = fsub float %tmp37, undef
+ %tmp121 = fmul float %tmp118, undef
+ %tmp122 = fmul float %tmp119, undef
+ %tmp123 = fmul float %tmp120, undef
+ %tmp124 = call float @llvm.minnum.f32(float %tmp115, float %tmp121)
+ %tmp125 = call float @llvm.minnum.f32(float %tmp116, float %tmp122)
+ %tmp126 = call float @llvm.minnum.f32(float %tmp117, float %tmp123)
+ %tmp127 = call float @llvm.maxnum.f32(float %tmp124, float %tmp125)
+ %tmp128 = call float @llvm.maxnum.f32(float %tmp127, float %tmp126)
+ %tmp129 = fsub float %tmp38, undef
+ %tmp130 = fsub float %tmp39, undef
+ %tmp131 = fsub float %tmp40, undef
+ %tmp132 = fmul float %tmp129, 0.000000e+00
+ %tmp133 = fmul float %tmp130, undef
+ %tmp134 = fmul float %tmp131, undef
+ %tmp135 = fsub float %tmp41, undef
+ %tmp136 = fsub float %tmp42, undef
+ %tmp137 = fsub float %tmp43, undef
+ %tmp138 = fmul float %tmp135, undef
+ %tmp139 = fmul float %tmp136, undef
+ %tmp140 = fmul float %tmp137, undef
+ %tmp141 = call float @llvm.minnum.f32(float %tmp132, float %tmp138)
+ %tmp142 = call float @llvm.minnum.f32(float %tmp133, float %tmp139)
+ %tmp143 = call float @llvm.minnum.f32(float %tmp134, float %tmp140)
+ %tmp144 = call float @llvm.maxnum.f32(float %tmp141, float %tmp142)
+ %tmp145 = call float @llvm.maxnum.f32(float %tmp144, float %tmp143)
+ %tmp146 = fsub float %tmp44, undef
+ %tmp147 = fsub float %tmp45, undef
+ %tmp148 = fsub float %tmp46, undef
+ %tmp149 = fmul float %tmp146, 0.000000e+00
+ %tmp150 = fmul float %tmp147, 0.000000e+00
+ %tmp151 = fmul float %tmp148, undef
+ %tmp152 = fsub float %tmp47, undef
+ %tmp153 = fsub float %tmp48, undef
+ %tmp154 = fsub float %tmp49, undef
+ %tmp155 = fmul float %tmp152, undef
+ %tmp156 = fmul float %tmp153, 0.000000e+00
+ %tmp157 = fmul float %tmp154, undef
+ %tmp158 = call float @llvm.minnum.f32(float %tmp149, float %tmp155)
+ %tmp159 = call float @llvm.minnum.f32(float %tmp150, float %tmp156)
+ %tmp160 = call float @llvm.minnum.f32(float %tmp151, float %tmp157)
+ %tmp161 = call float @llvm.maxnum.f32(float %tmp158, float %tmp159)
+ %tmp162 = call float @llvm.maxnum.f32(float %tmp161, float %tmp160)
+ %tmp163 = fsub float %tmp50, undef
+ %tmp164 = fsub float %tmp51, undef
+ %tmp165 = fsub float %tmp52, undef
+ %tmp166 = fmul float %tmp163, undef
+ %tmp167 = fmul float %tmp164, 0.000000e+00
+ %tmp168 = fmul float %tmp165, 0.000000e+00
+ %tmp169 = fsub float %tmp53, undef
+ %tmp170 = fsub float %tmp54, undef
+ %tmp171 = fsub float %tmp55, undef
+ %tmp172 = fdiv float 1.000000e+00, %temp18.0
+ %tmp173 = fmul float %tmp169, undef
+ %tmp174 = fmul float %tmp170, undef
+ %tmp175 = fmul float %tmp171, %tmp172
+ %tmp176 = call float @llvm.minnum.f32(float %tmp166, float %tmp173)
+ %tmp177 = call float @llvm.minnum.f32(float %tmp167, float %tmp174)
+ %tmp178 = call float @llvm.minnum.f32(float %tmp168, float %tmp175)
+ %tmp179 = call float @llvm.maxnum.f32(float %tmp176, float %tmp177)
+ %tmp180 = call float @llvm.maxnum.f32(float %tmp179, float %tmp178)
+ %tmp181 = fsub float %tmp62, undef
+ %tmp182 = fsub float %tmp63, undef
+ %tmp183 = fsub float %tmp64, undef
+ %tmp184 = fmul float %tmp181, 0.000000e+00
+ %tmp185 = fmul float %tmp182, undef
+ %tmp186 = fmul float %tmp183, undef
+ %tmp187 = fsub float %tmp65, undef
+ %tmp188 = fsub float %tmp66, undef
+ %tmp189 = fmul float %tmp187, undef
+ %tmp190 = fmul float %tmp188, undef
+ %tmp191 = call float @llvm.maxnum.f32(float %tmp184, float %tmp189)
+ %tmp192 = call float @llvm.maxnum.f32(float %tmp185, float %tmp190)
+ %tmp193 = call float @llvm.maxnum.f32(float %tmp186, float undef)
+ %tmp194 = call float @llvm.minnum.f32(float %tmp191, float %tmp192)
+ %tmp195 = call float @llvm.minnum.f32(float %tmp194, float %tmp193)
+ %.temp292.7 = select i1 undef, float %tmp162, float undef
+ %temp292.9 = select i1 false, float %tmp180, float %.temp292.7
%.temp292.9 = select i1 undef, float undef, float %temp292.9
- %196 = fcmp ogt float undef, 0.000000e+00
- %197 = fcmp olt float undef, %195
- %198 = and i1 %196, %197
- %199 = fcmp olt float undef, %.temp292.9
- %200 = and i1 %198, %199
- %temp292.11 = select i1 %200, float undef, float %.temp292.9
- %tid0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+ %tmp196 = fcmp ogt float undef, 0.000000e+00
+ %tmp197 = fcmp olt float undef, %tmp195
+ %tmp198 = and i1 %tmp196, %tmp197
+ %tmp199 = fcmp olt float undef, %.temp292.9
+ %tmp200 = and i1 %tmp198, %tmp199
+ %temp292.11 = select i1 %tmp200, float undef, float %.temp292.9
+ %tid0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%cmp0 = icmp eq i32 %tid0, 0
br i1 %cmp0, label %IF2565, label %ELSE2566
IF2565: ; preds = %ENDIF
- %tid1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+ %tid1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%cmp1 = icmp eq i32 %tid1, 0
br i1 %cmp1, label %ENDIF2582, label %ELSE2584
ELSE2566: ; preds = %ENDIF
- %tid2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+ %tid2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tidf = bitcast i32 %tid2 to float
- %201 = fcmp oeq float %temp292.11, %tidf
- br i1 %201, label %ENDLOOP, label %ELSE2593
+ %tmp201 = fcmp oeq float %temp292.11, %tidf
+ br i1 %tmp201, label %ENDLOOP, label %ELSE2593
ENDIF2564: ; preds = %ENDIF2594, %ENDIF2588
%temp894.1 = phi float [ undef, %ENDIF2588 ], [ %temp894.2, %ENDIF2594 ]
- %temp18.1 = phi float [ %218, %ENDIF2588 ], [ undef, %ENDIF2594 ]
- %202 = fsub float %5, undef
- %203 = fmul float %202, undef
- %204 = call float @llvm.maxnum.f32(float undef, float %203)
- %205 = call float @llvm.minnum.f32(float %204, float undef)
- %206 = call float @llvm.minnum.f32(float %205, float undef)
- %207 = fcmp ogt float undef, 0.000000e+00
- %208 = fcmp olt float undef, 1.000000e+00
- %209 = and i1 %207, %208
- %tid3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+ %temp18.1 = phi float [ %tmp218, %ENDIF2588 ], [ undef, %ENDIF2594 ]
+ %tmp202 = fsub float %tmp5, undef
+ %tmp203 = fmul float %tmp202, undef
+ %tmp204 = call float @llvm.maxnum.f32(float undef, float %tmp203)
+ %tmp205 = call float @llvm.minnum.f32(float %tmp204, float undef)
+ %tmp206 = call float @llvm.minnum.f32(float %tmp205, float undef)
+ %tmp207 = fcmp ogt float undef, 0.000000e+00
+ %tmp208 = fcmp olt float undef, 1.000000e+00
+ %tmp209 = and i1 %tmp207, %tmp208
+ %tid3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tidf3 = bitcast i32 %tid3 to float
- %210 = fcmp olt float %tidf3, %206
- %211 = and i1 %209, %210
- br i1 %211, label %ENDIF2795, label %ELSE2797
+ %tmp210 = fcmp olt float %tidf3, %tmp206
+ %tmp211 = and i1 %tmp209, %tmp210
+ br i1 %tmp211, label %ENDIF2795, label %ELSE2797
ELSE2584: ; preds = %IF2565
br label %ENDIF2582
ENDIF2582: ; preds = %ELSE2584, %IF2565
- %212 = fadd float %1, undef
- %213 = fadd float 0.000000e+00, %212
- %floor = call float @llvm.floor.f32(float %213)
- %214 = fsub float %213, %floor
- %tid4 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+ %tmp212 = fadd float %tmp1, undef
+ %tmp213 = fadd float 0.000000e+00, %tmp212
+ %floor = call float @llvm.floor.f32(float %tmp213)
+ %tmp214 = fsub float %tmp213, %floor
+ %tid4 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%cmp4 = icmp eq i32 %tid4, 0
br i1 %cmp4, label %IF2589, label %ELSE2590
@@ -280,61 +281,61 @@ ELSE2590: ; preds = %ENDIF2582
br label %ENDIF2588
ENDIF2588: ; preds = %ELSE2590, %IF2589
- %215 = fsub float 1.000000e+00, %214
- %216 = call float @llvm.sqrt.f32(float %215)
- %217 = fmul float %216, undef
- %218 = fadd float %217, undef
+ %tmp215 = fsub float 1.000000e+00, %tmp214
+ %tmp216 = call float @llvm.sqrt.f32(float %tmp215)
+ %tmp217 = fmul float %tmp216, undef
+ %tmp218 = fadd float %tmp217, undef
br label %ENDIF2564
ELSE2593: ; preds = %ELSE2566
- %219 = fcmp oeq float %temp292.11, %81
- %220 = fcmp olt float %81, %83
- %221 = and i1 %219, %220
- br i1 %221, label %ENDIF2594, label %ELSE2596
+ %tmp219 = fcmp oeq float %temp292.11, %tmp81
+ %tmp220 = fcmp olt float %tmp81, %tmp83
+ %tmp221 = and i1 %tmp219, %tmp220
+ br i1 %tmp221, label %ENDIF2594, label %ELSE2596
ELSE2596: ; preds = %ELSE2593
- %222 = fcmp oeq float %temp292.11, %100
- %223 = fcmp olt float %100, %102
- %224 = and i1 %222, %223
- br i1 %224, label %ENDIF2594, label %ELSE2632
+ %tmp222 = fcmp oeq float %temp292.11, %tmp100
+ %tmp223 = fcmp olt float %tmp100, %tmp102
+ %tmp224 = and i1 %tmp222, %tmp223
+ br i1 %tmp224, label %ENDIF2594, label %ELSE2632
ENDIF2594: ; preds = %ELSE2788, %ELSE2785, %ELSE2782, %ELSE2779, %IF2775, %ELSE2761, %ELSE2758, %IF2757, %ELSE2704, %ELSE2686, %ELSE2671, %ELSE2668, %IF2667, %ELSE2632, %ELSE2596, %ELSE2593
%temp894.2 = phi float [ 0.000000e+00, %IF2667 ], [ 0.000000e+00, %ELSE2671 ], [ 0.000000e+00, %IF2757 ], [ 0.000000e+00, %ELSE2761 ], [ %temp894.0, %ELSE2758 ], [ 0.000000e+00, %IF2775 ], [ 0.000000e+00, %ELSE2779 ], [ 0.000000e+00, %ELSE2782 ], [ %.2848, %ELSE2788 ], [ 0.000000e+00, %ELSE2785 ], [ 0.000000e+00, %ELSE2593 ], [ 0.000000e+00, %ELSE2632 ], [ 0.000000e+00, %ELSE2704 ], [ 0.000000e+00, %ELSE2686 ], [ 0.000000e+00, %ELSE2668 ], [ 0.000000e+00, %ELSE2596 ]
- %225 = fmul float %temp894.2, undef
+ %tmp225 = fmul float %temp894.2, undef
br label %ENDIF2564
ELSE2632: ; preds = %ELSE2596
br i1 undef, label %ENDIF2594, label %ELSE2650
ELSE2650: ; preds = %ELSE2632
- %226 = fcmp oeq float %temp292.11, %110
- %227 = fcmp olt float %110, %111
- %228 = and i1 %226, %227
- br i1 %228, label %IF2667, label %ELSE2668
+ %tmp226 = fcmp oeq float %temp292.11, %tmp110
+ %tmp227 = fcmp olt float %tmp110, %tmp111
+ %tmp228 = and i1 %tmp226, %tmp227
+ br i1 %tmp228, label %IF2667, label %ELSE2668
IF2667: ; preds = %ELSE2650
br i1 undef, label %ENDIF2594, label %ELSE2671
ELSE2668: ; preds = %ELSE2650
- %229 = fcmp oeq float %temp292.11, %128
- %230 = fcmp olt float %128, undef
- %231 = and i1 %229, %230
- br i1 %231, label %ENDIF2594, label %ELSE2686
+ %tmp229 = fcmp oeq float %temp292.11, %tmp128
+ %tmp230 = fcmp olt float %tmp128, undef
+ %tmp231 = and i1 %tmp229, %tmp230
+ br i1 %tmp231, label %ENDIF2594, label %ELSE2686
ELSE2671: ; preds = %IF2667
br label %ENDIF2594
ELSE2686: ; preds = %ELSE2668
- %232 = fcmp oeq float %temp292.11, %145
- %233 = fcmp olt float %145, undef
- %234 = and i1 %232, %233
- br i1 %234, label %ENDIF2594, label %ELSE2704
+ %tmp232 = fcmp oeq float %temp292.11, %tmp145
+ %tmp233 = fcmp olt float %tmp145, undef
+ %tmp234 = and i1 %tmp232, %tmp233
+ br i1 %tmp234, label %ENDIF2594, label %ELSE2704
ELSE2704: ; preds = %ELSE2686
- %235 = fcmp oeq float %temp292.11, %180
- %236 = fcmp olt float %180, undef
- %237 = and i1 %235, %236
- br i1 %237, label %ENDIF2594, label %ELSE2740
+ %tmp235 = fcmp oeq float %temp292.11, %tmp180
+ %tmp236 = fcmp olt float %tmp180, undef
+ %tmp237 = and i1 %tmp235, %tmp236
+ br i1 %tmp237, label %ENDIF2594, label %ELSE2740
ELSE2740: ; preds = %ELSE2704
br i1 undef, label %IF2757, label %ELSE2758
@@ -349,8 +350,8 @@ ELSE2761: ; preds = %IF2757
br label %ENDIF2594
IF2775: ; preds = %ELSE2758
- %238 = fcmp olt float undef, undef
- br i1 %238, label %ENDIF2594, label %ELSE2779
+ %tmp238 = fcmp olt float undef, undef
+ br i1 %tmp238, label %ENDIF2594, label %ELSE2779
ELSE2779: ; preds = %IF2775
br i1 undef, label %ENDIF2594, label %ELSE2782
@@ -359,39 +360,39 @@ ELSE2782: ; preds = %ELSE2779
br i1 undef, label %ENDIF2594, label %ELSE2785
ELSE2785: ; preds = %ELSE2782
- %239 = fcmp olt float undef, 0.000000e+00
- br i1 %239, label %ENDIF2594, label %ELSE2788
+ %tmp239 = fcmp olt float undef, 0.000000e+00
+ br i1 %tmp239, label %ENDIF2594, label %ELSE2788
ELSE2788: ; preds = %ELSE2785
- %240 = fcmp olt float 0.000000e+00, undef
- %.2848 = select i1 %240, float -1.000000e+00, float 1.000000e+00
+ %tmp240 = fcmp olt float 0.000000e+00, undef
+ %.2848 = select i1 %tmp240, float -1.000000e+00, float 1.000000e+00
br label %ENDIF2594
ELSE2797: ; preds = %ENDIF2564
- %241 = fsub float %8, undef
- %242 = fsub float %9, undef
- %243 = fsub float %10, undef
- %244 = fmul float %241, undef
- %245 = fmul float %242, undef
- %246 = fmul float %243, undef
- %247 = fsub float %11, undef
- %248 = fsub float %12, undef
- %249 = fsub float %13, undef
- %250 = fmul float %247, undef
- %251 = fmul float %248, undef
- %252 = fmul float %249, undef
- %253 = call float @llvm.minnum.f32(float %244, float %250)
- %254 = call float @llvm.minnum.f32(float %245, float %251)
- %255 = call float @llvm.maxnum.f32(float %246, float %252)
- %256 = call float @llvm.maxnum.f32(float %253, float %254)
- %257 = call float @llvm.maxnum.f32(float %256, float undef)
- %258 = call float @llvm.minnum.f32(float undef, float %255)
- %259 = fcmp ogt float %257, 0.000000e+00
- %260 = fcmp olt float %257, 1.000000e+00
- %261 = and i1 %259, %260
- %262 = fcmp olt float %257, %258
- %263 = and i1 %261, %262
- br i1 %263, label %ENDIF2795, label %ELSE2800
+ %tmp241 = fsub float %tmp8, undef
+ %tmp242 = fsub float %tmp9, undef
+ %tmp243 = fsub float %tmp10, undef
+ %tmp244 = fmul float %tmp241, undef
+ %tmp245 = fmul float %tmp242, undef
+ %tmp246 = fmul float %tmp243, undef
+ %tmp247 = fsub float %tmp11, undef
+ %tmp248 = fsub float %tmp12, undef
+ %tmp249 = fsub float %tmp13, undef
+ %tmp250 = fmul float %tmp247, undef
+ %tmp251 = fmul float %tmp248, undef
+ %tmp252 = fmul float %tmp249, undef
+ %tmp253 = call float @llvm.minnum.f32(float %tmp244, float %tmp250)
+ %tmp254 = call float @llvm.minnum.f32(float %tmp245, float %tmp251)
+ %tmp255 = call float @llvm.maxnum.f32(float %tmp246, float %tmp252)
+ %tmp256 = call float @llvm.maxnum.f32(float %tmp253, float %tmp254)
+ %tmp257 = call float @llvm.maxnum.f32(float %tmp256, float undef)
+ %tmp258 = call float @llvm.minnum.f32(float undef, float %tmp255)
+ %tmp259 = fcmp ogt float %tmp257, 0.000000e+00
+ %tmp260 = fcmp olt float %tmp257, 1.000000e+00
+ %tmp261 = and i1 %tmp259, %tmp260
+ %tmp262 = fcmp olt float %tmp257, %tmp258
+ %tmp263 = and i1 %tmp261, %tmp262
+ br i1 %tmp263, label %ENDIF2795, label %ELSE2800
ENDIF2795: ; preds = %ELSE2824, %ELSE2821, %ELSE2818, %ELSE2815, %ELSE2812, %ELSE2809, %ELSE2806, %ELSE2803, %ELSE2800, %ELSE2797, %ENDIF2564
br label %LOOP
@@ -400,53 +401,53 @@ ELSE2800: ; preds = %ELSE2797
br i1 undef, label %ENDIF2795, label %ELSE2803
ELSE2803: ; preds = %ELSE2800
- %264 = fsub float %20, undef
- %265 = fsub float %21, undef
- %266 = fsub float %22, undef
- %267 = fmul float %264, undef
- %268 = fmul float %265, undef
- %269 = fmul float %266, 0.000000e+00
- %270 = fsub float %23, undef
- %271 = fsub float %24, undef
- %272 = fsub float %25, undef
- %273 = fmul float %270, undef
- %274 = fmul float %271, undef
- %275 = fmul float %272, undef
- %276 = call float @llvm.minnum.f32(float %267, float %273)
- %277 = call float @llvm.maxnum.f32(float %268, float %274)
- %278 = call float @llvm.maxnum.f32(float %269, float %275)
- %279 = call float @llvm.maxnum.f32(float %276, float undef)
- %280 = call float @llvm.maxnum.f32(float %279, float undef)
- %281 = call float @llvm.minnum.f32(float undef, float %277)
- %282 = call float @llvm.minnum.f32(float %281, float %278)
- %283 = fcmp ogt float %280, 0.000000e+00
- %284 = fcmp olt float %280, 1.000000e+00
- %285 = and i1 %283, %284
- %286 = fcmp olt float %280, %282
- %287 = and i1 %285, %286
- br i1 %287, label %ENDIF2795, label %ELSE2806
+ %tmp264 = fsub float %tmp20, undef
+ %tmp265 = fsub float %tmp21, undef
+ %tmp266 = fsub float %tmp22, undef
+ %tmp267 = fmul float %tmp264, undef
+ %tmp268 = fmul float %tmp265, undef
+ %tmp269 = fmul float %tmp266, 0.000000e+00
+ %tmp270 = fsub float %tmp23, undef
+ %tmp271 = fsub float %tmp24, undef
+ %tmp272 = fsub float %tmp25, undef
+ %tmp273 = fmul float %tmp270, undef
+ %tmp274 = fmul float %tmp271, undef
+ %tmp275 = fmul float %tmp272, undef
+ %tmp276 = call float @llvm.minnum.f32(float %tmp267, float %tmp273)
+ %tmp277 = call float @llvm.maxnum.f32(float %tmp268, float %tmp274)
+ %tmp278 = call float @llvm.maxnum.f32(float %tmp269, float %tmp275)
+ %tmp279 = call float @llvm.maxnum.f32(float %tmp276, float undef)
+ %tmp280 = call float @llvm.maxnum.f32(float %tmp279, float undef)
+ %tmp281 = call float @llvm.minnum.f32(float undef, float %tmp277)
+ %tmp282 = call float @llvm.minnum.f32(float %tmp281, float %tmp278)
+ %tmp283 = fcmp ogt float %tmp280, 0.000000e+00
+ %tmp284 = fcmp olt float %tmp280, 1.000000e+00
+ %tmp285 = and i1 %tmp283, %tmp284
+ %tmp286 = fcmp olt float %tmp280, %tmp282
+ %tmp287 = and i1 %tmp285, %tmp286
+ br i1 %tmp287, label %ENDIF2795, label %ELSE2806
ELSE2806: ; preds = %ELSE2803
- %288 = fsub float %26, undef
- %289 = fsub float %27, undef
- %290 = fsub float %28, undef
- %291 = fmul float %288, undef
- %292 = fmul float %289, 0.000000e+00
- %293 = fmul float %290, undef
- %294 = fsub float %29, undef
- %295 = fmul float %294, undef
- %296 = call float @llvm.minnum.f32(float %291, float %295)
- %297 = call float @llvm.minnum.f32(float %292, float undef)
- %298 = call float @llvm.maxnum.f32(float %293, float undef)
- %299 = call float @llvm.maxnum.f32(float %296, float %297)
- %300 = call float @llvm.maxnum.f32(float %299, float undef)
- %301 = call float @llvm.minnum.f32(float undef, float %298)
- %302 = fcmp ogt float %300, 0.000000e+00
- %303 = fcmp olt float %300, 1.000000e+00
- %304 = and i1 %302, %303
- %305 = fcmp olt float %300, %301
- %306 = and i1 %304, %305
- br i1 %306, label %ENDIF2795, label %ELSE2809
+ %tmp288 = fsub float %tmp26, undef
+ %tmp289 = fsub float %tmp27, undef
+ %tmp290 = fsub float %tmp28, undef
+ %tmp291 = fmul float %tmp288, undef
+ %tmp292 = fmul float %tmp289, 0.000000e+00
+ %tmp293 = fmul float %tmp290, undef
+ %tmp294 = fsub float %tmp29, undef
+ %tmp295 = fmul float %tmp294, undef
+ %tmp296 = call float @llvm.minnum.f32(float %tmp291, float %tmp295)
+ %tmp297 = call float @llvm.minnum.f32(float %tmp292, float undef)
+ %tmp298 = call float @llvm.maxnum.f32(float %tmp293, float undef)
+ %tmp299 = call float @llvm.maxnum.f32(float %tmp296, float %tmp297)
+ %tmp300 = call float @llvm.maxnum.f32(float %tmp299, float undef)
+ %tmp301 = call float @llvm.minnum.f32(float undef, float %tmp298)
+ %tmp302 = fcmp ogt float %tmp300, 0.000000e+00
+ %tmp303 = fcmp olt float %tmp300, 1.000000e+00
+ %tmp304 = and i1 %tmp302, %tmp303
+ %tmp305 = fcmp olt float %tmp300, %tmp301
+ %tmp306 = and i1 %tmp304, %tmp305
+ br i1 %tmp306, label %ENDIF2795, label %ELSE2809
ELSE2809: ; preds = %ELSE2806
br i1 undef, label %ENDIF2795, label %ELSE2812
@@ -461,53 +462,42 @@ ELSE2818: ; preds = %ELSE2815
br i1 undef, label %ENDIF2795, label %ELSE2821
ELSE2821: ; preds = %ELSE2818
- %307 = fsub float %56, undef
- %308 = fsub float %57, undef
- %309 = fsub float %58, undef
- %310 = fmul float %307, undef
- %311 = fmul float %308, 0.000000e+00
- %312 = fmul float %309, undef
- %313 = fsub float %59, undef
- %314 = fsub float %60, undef
- %315 = fsub float %61, undef
- %316 = fmul float %313, undef
- %317 = fmul float %314, undef
- %318 = fmul float %315, undef
- %319 = call float @llvm.maxnum.f32(float %310, float %316)
- %320 = call float @llvm.maxnum.f32(float %311, float %317)
- %321 = call float @llvm.maxnum.f32(float %312, float %318)
- %322 = call float @llvm.minnum.f32(float %319, float %320)
- %323 = call float @llvm.minnum.f32(float %322, float %321)
- %324 = fcmp ogt float undef, 0.000000e+00
- %325 = fcmp olt float undef, 1.000000e+00
- %326 = and i1 %324, %325
- %327 = fcmp olt float undef, %323
- %328 = and i1 %326, %327
- br i1 %328, label %ENDIF2795, label %ELSE2824
+ %tmp307 = fsub float %tmp56, undef
+ %tmp308 = fsub float %tmp57, undef
+ %tmp309 = fsub float %tmp58, undef
+ %tmp310 = fmul float %tmp307, undef
+ %tmp311 = fmul float %tmp308, 0.000000e+00
+ %tmp312 = fmul float %tmp309, undef
+ %tmp313 = fsub float %tmp59, undef
+ %tmp314 = fsub float %tmp60, undef
+ %tmp315 = fsub float %tmp61, undef
+ %tmp316 = fmul float %tmp313, undef
+ %tmp317 = fmul float %tmp314, undef
+ %tmp318 = fmul float %tmp315, undef
+ %tmp319 = call float @llvm.maxnum.f32(float %tmp310, float %tmp316)
+ %tmp320 = call float @llvm.maxnum.f32(float %tmp311, float %tmp317)
+ %tmp321 = call float @llvm.maxnum.f32(float %tmp312, float %tmp318)
+ %tmp322 = call float @llvm.minnum.f32(float %tmp319, float %tmp320)
+ %tmp323 = call float @llvm.minnum.f32(float %tmp322, float %tmp321)
+ %tmp324 = fcmp ogt float undef, 0.000000e+00
+ %tmp325 = fcmp olt float undef, 1.000000e+00
+ %tmp326 = and i1 %tmp324, %tmp325
+ %tmp327 = fcmp olt float undef, %tmp323
+ %tmp328 = and i1 %tmp326, %tmp327
+ br i1 %tmp328, label %ENDIF2795, label %ELSE2824
ELSE2824: ; preds = %ELSE2821
%.2849 = select i1 undef, float 0.000000e+00, float 1.000000e+00
br label %ENDIF2795
}
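
The mass rename above follows from how LLVM numbers anonymous values: %225 is an implicit, consecutively assigned number, so inserting or deleting any instruction earlier in the function renumbers everything after it and invalidates the test. Named values such as %tmp225 stay stable under edits. A minimal sketch of the two styles (hypothetical function, for illustration only):

define float @named_vs_numbered(float %a, float %b) {
bb:
  %tmp = fmul float %a, %b     ; explicit name: unaffected by edits elsewhere
  %tmp9 = fadd float %tmp, %a  ; names need not be dense or consecutive
  ret float %tmp9
}
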
-declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
-
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-
-; Function Attrs: nounwind readnone
declare float @llvm.floor.f32(float) #1
-
-; Function Attrs: nounwind readnone
declare float @llvm.sqrt.f32(float) #1
-
-; Function Attrs: nounwind readnone
declare float @llvm.minnum.f32(float, float) #1
-
-; Function Attrs: nounwind readnone
declare float @llvm.maxnum.f32(float, float) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
index 062f5245af10..114c97b61bd4 100644
--- a/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
+++ b/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
@@ -8,7 +8,7 @@
; ALL: s_mov_b32 s[[HI:[0-9]+]], 0xe80000
; Make sure we are handling hazards correctly.
-; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:12
+; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:16
; SGPR-NEXT: s_waitcnt vmcnt(0)
; SGPR-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]]
; SGPR-NEXT: s_nop 4
@@ -16,15 +16,15 @@
; Make sure scratch wave offset register is correctly incremented and
; then restored.
-; SMEM: s_mov_b32 m0, s[[OFF]]{{$}}
+; SMEM: s_add_u32 m0, s[[OFF]], 0x100{{$}}
; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LO]]:[[HI]]], m0 ; 16-byte Folded Spill
-; SMEM: s_mov_b32 m0, s[[OFF]]{{$}}
+; SMEM: s_add_u32 m0, s[[OFF]], 0x100{{$}}
; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LO]]:[[HI]]], m0 ; 16-byte Folded Reload
; SMEM: s_dcache_wb
; ALL: s_endpgm
-define void @test(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) {
call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" ()
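
This test forces SGPR spilling by clobbering whole 8-register SGPR tuples with empty inline asm, leaving the allocator no choice but to spill; the checks above then pin down the reload hazard handling (s_waitcnt before v_readfirstlane, plus s_nop padding) and, on the SMEM path, the spill slot byte offset now added into m0 rather than a bare copy. A sketch of the forcing technique, using the same constraint syntax (two clobbers alone will not spill; the test clobbers every tuple while a value is live):

define amdgpu_kernel void @force_sgpr_spill() {
  ; each clobber takes an 8-SGPR tuple out of circulation
  call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}"()
  call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}"()
  ret void
}
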
diff --git a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index 4beefb047f22..8a4cee264fd8 100644
--- a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
@@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #2
; FUNC-LABEL: @reorder_local_load_global_store_local_load
; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3
; CI: buffer_store_dword
-define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+define amdgpu_kernel void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
%ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
%ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
@@ -33,7 +33,7 @@ define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out,
; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
; CI: buffer_store_dword
; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
-define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+define amdgpu_kernel void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
%ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
%ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
@@ -53,7 +53,7 @@ define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspac
; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
; CI: buffer_store_dword
-define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+define amdgpu_kernel void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
%ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
%ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
@@ -77,7 +77,7 @@ define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace
; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3
; CI: buffer_store_dword
-define void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+define amdgpu_kernel void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
%ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
%ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
@@ -100,7 +100,7 @@ define void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)*
; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3
; CI: ds_write_b32
; CI: buffer_store_dword
-define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
+define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
%ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
%ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
@@ -122,7 +122,7 @@ define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %
; CI: s_load_dword
; CI: ds_write_b32
; CI: buffer_store_dword
-define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 {
+define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 {
%ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
%ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
@@ -141,7 +141,7 @@ define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32
; CI: buffer_load_dword
; CI: buffer_load_dword
; CI: buffer_store_dword
-define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
+define amdgpu_kernel void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
%ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1
%ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 3
@@ -157,12 +157,11 @@ define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out,
; FUNC-LABEL: @reorder_local_offsets
; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102
-; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:100
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
-; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
+; CI-DAG: ds_write2_b32 {{v[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:3 offset1:100
+; CI-DAG: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
; CI: buffer_store_dword
; CI: s_endpgm
-define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
+define amdgpu_kernel void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
%ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
%ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100
%ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 102
@@ -181,14 +180,14 @@ define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspa
}
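
The switch from ordered CI checks to CI-DAG reflects that the LDS writes are independent, so the scheduler is free to emit them in either order; DAG checks accept any interleaving instead of baking one schedule into the test. A reduced sketch of the disjoint-offset pattern (hypothetical kernel):

define amdgpu_kernel void @disjoint_lds_writes(i32 addrspace(3)* %ptr0) {
  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 102
  ; same base, provably disjoint offsets: the resulting ds_write ops may be
  ; scheduled in either order, so FileCheck must not impose one
  store i32 123, i32 addrspace(3)* %ptr1, align 4
  store i32 456, i32 addrspace(3)* %ptr2, align 4
  ret void
}
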
; FUNC-LABEL: @reorder_global_offsets
-; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
-; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
-; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
-; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
-; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
-; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
+; CI-DAG: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI-DAG: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
+; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
+; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
+; CI: buffer_store_dword
; CI: s_endpgm
-define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
+define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
%ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3
%ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100
%ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 102
@@ -222,7 +221,7 @@ define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrsp
; GCN: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:36{{$}}
; GCN-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:52{{$}}
-define void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 {
+define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%id.ext = sext i32 %id to i64
diff --git a/test/CodeGen/AMDGPU/si-vector-hang.ll b/test/CodeGen/AMDGPU/si-vector-hang.ll
index dd8783df5c3c..7990990478af 100644
--- a/test/CodeGen/AMDGPU/si-vector-hang.ll
+++ b/test/CodeGen/AMDGPU/si-vector-hang.ll
@@ -12,7 +12,7 @@
; CHECK: buffer_store_byte
; ModuleID = 'radeon'
-define void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 {
+define amdgpu_kernel void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 {
entry:
%0 = load i8, i8 addrspace(1)* %in0, align 1
%1 = insertelement <8 x i8> undef, i8 %0, i32 0
diff --git a/test/CodeGen/AMDGPU/sign_extend.ll b/test/CodeGen/AMDGPU/sign_extend.ll
index 875351c59961..3e452c214e98 100644
--- a/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/test/CodeGen/AMDGPU/sign_extend.ll
@@ -4,7 +4,7 @@
; GCN-LABEL: {{^}}s_sext_i1_to_i32:
; GCN: v_cndmask_b32_e64
; GCN: s_endpgm
-define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp eq i32 %a, %b
%sext = sext i1 %cmp to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
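
The v_cndmask_b32_e64 in the check follows from sext on an i1: true sign-extends to all ones. The two forms below are equivalent in IR, which is why the expected select operands are 0 and -1 (illustrative function, not part of the test):

define i32 @sext_i1_equiv(i1 %c) {
  %s = sext i1 %c to i32             ; 0 or 0xffffffff
  %t = select i1 %c, i32 -1, i32 0   ; same value; lowers to v_cndmask_b32_e64 ..., 0, -1
  %zero = xor i32 %s, %t             ; always 0
  ret i32 %zero
}
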
@@ -14,7 +14,7 @@ define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; GCN-LABEL: {{^}}test_s_sext_i32_to_i64:
; GCN: s_ashr_i32
; GCN: s_endpgm
-define void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
+define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
entry:
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
@@ -28,7 +28,7 @@ entry:
; GCN: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}}
; GCN: s_endpgm
-define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp eq i32 %a, %b
%sext = sext i1 %cmp to i64
store i64 %sext, i64 addrspace(1)* %out, align 8
@@ -38,7 +38,7 @@ define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; GCN-LABEL: {{^}}s_sext_i32_to_i64:
; GCN: s_ashr_i32
; GCN: s_endpgm
-define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
%sext = sext i32 %a to i64
store i64 %sext, i64 addrspace(1)* %out, align 8
ret void
@@ -47,7 +47,7 @@ define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
; GCN-LABEL: {{^}}v_sext_i32_to_i64:
; GCN: v_ashr
; GCN: s_endpgm
-define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
%val = load i32, i32 addrspace(1)* %in, align 4
%sext = sext i32 %val to i64
store i64 %sext, i64 addrspace(1)* %out, align 8
@@ -56,7 +56,7 @@ define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) no
; GCN-LABEL: {{^}}s_sext_i16_to_i64:
; GCN: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100000
-define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
+define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
%sext = sext i16 %a to i64
store i64 %sext, i64 addrspace(1)* %out, align 8
ret void
@@ -65,7 +65,7 @@ define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
; GCN-LABEL: {{^}}s_sext_i1_to_i16:
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
; GCN-NEXT: buffer_store_short [[RESULT]]
-define void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp eq i32 %a, %b
%sext = sext i1 %cmp to i16
store i16 %sext, i16 addrspace(1)* %out
@@ -79,7 +79,7 @@ define void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; GCN-LABEL: {{^}}s_sext_i1_to_i16_with_and:
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
; GCN-NEXT: buffer_store_short [[RESULT]]
-define void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
+define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
%cmp0 = icmp eq i32 %a, %b
%cmp1 = icmp eq i32 %c, %d
%cmp = and i1 %cmp0, %cmp1
@@ -91,7 +91,7 @@ define void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i
; GCN-LABEL: {{^}}v_sext_i1_to_i16_with_and:
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
; GCN-NEXT: buffer_store_short [[RESULT]]
-define void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
+define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%cmp0 = icmp eq i32 %a, %tid
%cmp1 = icmp eq i32 %b, %c
@@ -130,7 +130,7 @@ define void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i
; GCN-DAG: buffer_store_dword [[VEXT3]]
; GCN: s_endpgm
-define void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
%cast = bitcast i32 %a to <4 x i8>
%ext = sext <4 x i8> %cast to <4 x i32>
%elt0 = extractelement <4 x i32> %ext, i32 0
@@ -162,7 +162,7 @@ define void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; GCN: buffer_store_dword [[EXT1]]
; GCN: buffer_store_dword [[EXT2]]
; GCN: buffer_store_dword [[EXT3]]
-define void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
%a = load i32, i32 addrspace(1)* %in
%cast = bitcast i32 %a to <4 x i8>
%ext = sext <4 x i8> %cast to <4 x i32>
@@ -184,7 +184,7 @@ define void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; GCN-DAG: s_sext_i32_i16
; GCN-DAG: s_sext_i32_i16
; GCN: s_endpgm
-define void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
+define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
%cast = bitcast i64 %a to <4 x i16>
%ext = sext <4 x i16> %cast to <4 x i32>
%elt0 = extractelement <4 x i32> %ext, i32 0
@@ -206,7 +206,7 @@ define void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; GCN: s_endpgm
-define void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
%a = load i64, i64 addrspace(1)* %in
%cast = bitcast i64 %a to <4 x i16>
%ext = sext <4 x i16> %cast to <4 x i32>
diff --git a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index 68dc3c6ccd24..f98a716b4fd1 100644
--- a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; SI-LABEL: {{^}}sint_to_fp_i32_to_f64
; SI: v_cvt_f64_i32_e32
-define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
%result = sitofp i32 %in to double
store double %result, double addrspace(1)* %out
ret void
@@ -19,7 +19,7 @@ define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; SI: buffer_store_dwordx2 v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
; SI: s_endpgm
-define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
%cmp = icmp eq i32 %in, 0
%fp = sitofp i1 %cmp to double
store double %fp, double addrspace(1)* %out, align 4
@@ -31,14 +31,14 @@ define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
; SI-NEXT: v_cvt_f64_i32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[IRESULT]]
; SI: buffer_store_dwordx2 [[RESULT]]
; SI: s_endpgm
-define void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) {
+define amdgpu_kernel void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) {
%fp = sitofp i1 %in to double
store double %fp, double addrspace(1)* %out, align 8
ret void
}
; SI-LABEL: @s_sint_to_fp_i64_to_f64
-define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
%result = sitofp i64 %in to double
store double %result, double addrspace(1)* %out
ret void
@@ -51,7 +51,7 @@ define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
; SI-DAG: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32
; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]]
; SI: buffer_store_dwordx2 [[RESULT]]
-define void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%val = load i64, i64 addrspace(1)* %gep, align 8
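
The checks above spell out the lowering: the i64 is split into halves, the high half converted as signed and the low half as unsigned, the high result scaled by 2^32 (the v_ldexp_f64 by 32), and the parts added. A sketch of the same decomposition in IR, assuming the usual hi * 2^32 + lo split:

define double @sitofp_i64_decomposed(i64 %x) {
  %lo64 = and i64 %x, 4294967295
  %lo = trunc i64 %lo64 to i32
  %hi64 = lshr i64 %x, 32
  %hi = trunc i64 %hi64 to i32
  %fhi = sitofp i32 %hi to double                 ; signed high half (v_cvt_f64_i32)
  %flo = uitofp i32 %lo to double                 ; unsigned low half (v_cvt_f64_u32)
  %scaled = fmul double %fhi, 0x41F0000000000000  ; * 2^32, exact; stands in for ldexp by 32
  %sum = fadd double %scaled, %flo
  ret double %sum
}
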
diff --git a/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
index 5df8105116cc..04cd199b81ae 100644
--- a/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
+++ b/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
@@ -4,7 +4,7 @@
; FIXME: This should be merged with sint_to_fp.ll, but s_sint_to_fp_v2i64 crashes on r600
; FUNC-LABEL: {{^}}s_sint_to_fp_i64_to_f16:
-define void @s_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 {
+define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 {
%result = sitofp i64 %in to half
store half %result, half addrspace(1)* %out
ret void
@@ -28,7 +28,7 @@ define void @s_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 {
; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]],
; GCN: v_cvt_f16_f32_e32 [[SIGN_SEL_F16:v[0-9]+]], [[SIGN_SEL]]
; GCN: {{buffer|flat}}_store_short {{.*}}[[SIGN_SEL_F16]]
-define void @v_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
@@ -39,7 +39,7 @@ define void @v_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)*
}
; FUNC-LABEL: {{^}}s_sint_to_fp_i64_to_f32:
-define void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
+define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
%result = sitofp i64 %in to float
store float %result, float addrspace(1)* %out
ret void
@@ -62,7 +62,7 @@ define void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]],
; GCN: {{buffer|flat}}_store_dword {{.*}}[[SIGN_SEL]]
-define void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -74,14 +74,14 @@ define void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)*
; FUNC-LABEL: {{^}}s_sint_to_fp_v2i64_to_v2f32:
; GCN-NOT: v_and_b32_e32 v{{[0-9]+}}, -1,
-define void @s_sint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{
+define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{
%result = sitofp <2 x i64> %in to <2 x float>
store <2 x float> %result, <2 x float> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}v_sint_to_fp_v4i64_to_v4f32:
-define void @v_sint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
%out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
@@ -93,14 +93,14 @@ define void @v_sint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i6
; FUNC-LABEL: {{^}}s_sint_to_fp_v2i64_to_v2f16:
; GCN-NOT: v_and_b32_e32 v{{[0-9]+}}, -1,
-define void @s_sint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{
+define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{
%result = sitofp <2 x i64> %in to <2 x half>
store <2 x half> %result, <2 x half> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}v_sint_to_fp_v4i64_to_v4f16:
-define void @v_sint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
%out.gep = getelementptr <4 x half>, <4 x half> addrspace(1)* %out, i32 %tid
diff --git a/test/CodeGen/AMDGPU/sint_to_fp.ll b/test/CodeGen/AMDGPU/sint_to_fp.ll
index 4c8fea12bada..8e85d9998597 100644
--- a/test/CodeGen/AMDGPU/sint_to_fp.ll
+++ b/test/CodeGen/AMDGPU/sint_to_fp.ll
@@ -6,7 +6,7 @@
; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, {{s[0-9]+$}}
; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-define void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 {
%result = sitofp i32 %in to float
store float %result, float addrspace(1)* %out
ret void
@@ -16,7 +16,7 @@ define void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 {
; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, {{v[0-9]+$}}
; R600: INT_TO_FLT
-define void @v_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -32,7 +32,7 @@ define void @v_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)*
; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
-define void @s_sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0{
+define amdgpu_kernel void @s_sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0{
%result = sitofp <2 x i32> %in to <2 x float>
store <2 x float> %result, <2 x float> addrspace(1)* %out
ret void
@@ -49,7 +49,7 @@ define void @s_sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #
; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @s_sint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @s_sint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
%value = load <4 x i32>, <4 x i32> addrspace(1) * %in
%result = sitofp <4 x i32> %value to <4 x float>
store <4 x float> %result, <4 x float> addrspace(1)* %out
@@ -66,7 +66,7 @@ define void @s_sint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i3
; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @v_sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid
%out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
@@ -81,7 +81,7 @@ define void @v_sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrsp
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
-define void @s_sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @s_sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) #0 {
%cmp = icmp eq i32 %in, 0
%fp = uitofp i1 %cmp to float
store float %fp, float addrspace(1)* %out
@@ -92,7 +92,7 @@ define void @s_sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) #0 {
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
-define void @s_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) #0 {
+define amdgpu_kernel void @s_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) #0 {
%fp = sitofp i1 %in to float
store float %fp, float addrspace(1)* %out
ret void
@@ -105,7 +105,7 @@ define void @s_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) #0 {
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0
; SI: {{buffer|flat}}_store_dword {{.*}}[[RESULT]]
; SI: s_endpgm
-define void @v_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i1, i1 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
diff --git a/test/CodeGen/AMDGPU/sitofp.f16.ll b/test/CodeGen/AMDGPU/sitofp.f16.ll
index 1395fa2bfea0..574d1c0b2c78 100644
--- a/test/CodeGen/AMDGPU/sitofp.f16.ll
+++ b/test/CodeGen/AMDGPU/sitofp.f16.ll
@@ -7,7 +7,7 @@
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @sitofp_i16_to_f16(
+define amdgpu_kernel void @sitofp_i16_to_f16(
half addrspace(1)* %r,
i16 addrspace(1)* %a) {
entry:
@@ -23,7 +23,7 @@ entry:
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_I16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @sitofp_i32_to_f16(
+define amdgpu_kernel void @sitofp_i32_to_f16(
half addrspace(1)* %r,
i32 addrspace(1)* %a) {
entry:
@@ -37,15 +37,24 @@ entry:
; GCN-LABEL: {{^}}sitofp_v2i16_to_v2f16
; GCN: buffer_load_dword
-; GCN: v_cvt_f32_i32_e32
-; GCN: v_cvt_f32_i32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN-DAG: v_lshlrev_b32_e32
-; GCN-DAG: v_or_b32_e32
-; GCN: buffer_store_dword
-; GCN: s_endpgm
-define void @sitofp_v2i16_to_v2f16(
+
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI-DAG: v_lshlrev_b32_e32
+; SI: v_or_b32_e32
+
+; VI-DAG: v_cvt_f32_i32_sdwa
+; VI-DAG: v_cvt_f32_i32_sdwa
+; VI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_sdwa
+; VI: v_or_b32_e32
+
+; GCN: buffer_store_dword
+; GCN: s_endpgm
+
+define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
<2 x half> addrspace(1)* %r,
<2 x i16> addrspace(1)* %a) {
entry:
@@ -56,17 +65,24 @@ entry:
}
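
Splitting these checks into SI and VI prefixes tracks a codegen difference: VI has SDWA (sub-dword addressing) operands, so v_cvt_f32_i32_sdwa can convert each 16-bit half of the loaded dword directly instead of shifting it out first. Conceptually the backend scalarizes the vector conversion; a sketch of that view (illustrative only):

define <2 x half> @sitofp_v2i16_scalarized(<2 x i16> %v) {
  %e0 = extractelement <2 x i16> %v, i32 0
  %e1 = extractelement <2 x i16> %v, i32 1   ; on VI, an SDWA read of the high word
  %f0 = sitofp i16 %e0 to half
  %f1 = sitofp i16 %e1 to half
  %r0 = insertelement <2 x half> undef, half %f0, i32 0
  %r1 = insertelement <2 x half> %r0, half %f1, i32 1
  ret <2 x half> %r1
}
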
; GCN-LABEL: {{^}}sitofp_v2i32_to_v2f16
-; GCN: buffer_load_dwordx2
-; GCN: v_cvt_f32_i32_e32
-; GCN: v_cvt_f32_i32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN-DAG: v_and_b32_e32
-; GCN-DAG: v_lshlrev_b32_e32
-; GCN-DAG: v_or_b32_e32
-; GCN: buffer_store_dword
-; GCN: s_endpgm
-define void @sitofp_v2i32_to_v2f16(
+; GCN: buffer_load_dwordx2
+
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI-DAG: v_lshlrev_b32_e32
+; SI: v_or_b32_e32
+
+; VI-DAG: v_cvt_f32_i32_e32
+; VI-DAG: v_cvt_f32_i32_e32
+; VI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_sdwa
+; VI: v_or_b32_e32
+
+; GCN: buffer_store_dword
+; GCN: s_endpgm
+define amdgpu_kernel void @sitofp_v2i32_to_v2f16(
<2 x half> addrspace(1)* %r,
<2 x i32> addrspace(1)* %a) {
entry:
diff --git a/test/CodeGen/AMDGPU/skip-if-dead.ll b/test/CodeGen/AMDGPU/skip-if-dead.ll
index 60cee7a3499e..3f53572ab440 100644
--- a/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -357,7 +357,7 @@ bb7: ; preds = %bb4
; CHECK: [[END]]:
; CHECK: s_or_b64 exec, exec
; CHECK: s_endpgm
-define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, <4 x i32> %arg2) #0 {
+define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, <4 x float> %arg2) #0 {
bb:
%tmp = fcmp ult float %arg1, 0.000000e+00
br i1 %tmp, label %bb3, label %bb4
@@ -367,7 +367,7 @@ bb3: ; preds = %bb
br label %bb4
bb4: ; preds = %bb3, %bb
- %tmp5 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %arg2, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp5 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %arg2, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp6 = extractelement <4 x float> %tmp5, i32 0
%tmp7 = fcmp une float %tmp6, 0.000000e+00
br i1 %tmp7, label %bb8, label %bb9
@@ -380,9 +380,8 @@ bb9: ; preds = %bb4
ret void
}
+declare <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1
declare void @llvm.AMDGPU.kill(float) #0
-declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) nounwind
attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
+attributes #1 = { nounwind readonly }
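
The attribute change goes with the intrinsic swap above: the retired llvm.SI.image.sample.c took i32 packed arguments and was modeled as readnone, while llvm.amdgcn.image.sample.c takes float coordinates plus i1 modifier flags and reads the image resource, hence nounwind readonly. A minimal call site using the declared signature (operand values are placeholders):

declare <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)

define amdgpu_ps <4 x float> @sample_sketch(<4 x float> %coord) {
  %res = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %coord, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
  ret <4 x float> %res
}
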
diff --git a/test/CodeGen/AMDGPU/smed3.ll b/test/CodeGen/AMDGPU/smed3.ll
index 985c73904f43..8665ab697265 100644
--- a/test/CodeGen/AMDGPU/smed3.ll
+++ b/test/CodeGen/AMDGPU/smed3.ll
@@ -1,12 +1,13 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i32:
; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
-define void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+define amdgpu_kernel void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@@ -24,8 +25,8 @@ define void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a
; GCN-LABEL: {{^}}v_test_smed3_multi_use_r_i_i_i32:
; GCN: v_max_i32
; GCN: v_min_i32
-define void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+define amdgpu_kernel void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@@ -44,8 +45,8 @@ define void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrsp
; GCN-LABEL: {{^}}v_test_smed3_r_i_i_constant_order_i32:
; GCN: v_max_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
; GCN: v_min_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
-define void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+define amdgpu_kernel void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@@ -63,8 +64,8 @@ define void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 a
; GCN-LABEL: {{^}}v_test_smed3_r_i_i_sign_mismatch_i32:
; GCN: v_max_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
; GCN: v_min_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
-define void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+define amdgpu_kernel void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@@ -82,8 +83,8 @@ define void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 ad
; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i64:
; GCN: v_cmp_lt_i64
; GCN: v_cmp_gt_i64
-define void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+define amdgpu_kernel void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
%a = load i64, i64 addrspace(1)* %gep0
@@ -99,9 +100,10 @@ define void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a
}
; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i16:
-; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
-define void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+; SICIVI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+define amdgpu_kernel void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
%a = load i16, i16 addrspace(1)* %gep0
@@ -172,7 +174,7 @@ define internal i8 @smax8(i8 %x, i8 %y) #2 {
; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0:
; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @smin(i32 %x, i32 %y)
%tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -184,7 +186,7 @@ bb:
; GCN-LABEL: {{^}}s_test_smed3_i32_pat_1:
; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @smin(i32 %x, i32 %y)
%tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -196,7 +198,7 @@ bb:
; GCN-LABEL: {{^}}s_test_smed3_i32_pat_2:
; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @smin(i32 %x, i32 %y)
%tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -208,7 +210,7 @@ bb:
; GCN-LABEL: {{^}}s_test_smed3_i32_pat_3:
; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @smin(i32 %x, i32 %y)
%tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -220,7 +222,7 @@ bb:
; GCN-LABEL: {{^}}s_test_smed3_i32_pat_4:
; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @smin(i32 %y, i32 %x)
%tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -232,7 +234,7 @@ bb:
; GCN-LABEL: {{^}}s_test_smed3_i32_pat_5:
; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @smin(i32 %y, i32 %x)
%tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -244,7 +246,7 @@ bb:
; GCN-LABEL: {{^}}s_test_smed3_i32_pat_6:
; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @smin(i32 %y, i32 %x)
%tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -256,7 +258,7 @@ bb:
; GCN-LABEL: {{^}}s_test_smed3_i32_pat_7:
; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @smin(i32 %y, i32 %x)
%tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -268,7 +270,7 @@ bb:
; GCN-LABEL: {{^}}s_test_smed3_i32_pat_8:
; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @smin(i32 %x, i32 %y)
%tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -280,7 +282,7 @@ bb:
; GCN-LABEL: {{^}}s_test_smed3_i32_pat_9:
; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @smin(i32 %x, i32 %y)
%tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -292,7 +294,7 @@ bb:
; GCN-LABEL: {{^}}s_test_smed3_i32_pat_10:
; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @smin(i32 %x, i32 %y)
%tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -304,7 +306,7 @@ bb:
; GCN-LABEL: {{^}}s_test_smed3_i32_pat_11:
; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @smin(i32 %x, i32 %y)
%tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -316,7 +318,7 @@ bb:
; GCN-LABEL: {{^}}s_test_smed3_i32_pat_12:
; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @smin(i32 %y, i32 %x)
%tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -328,7 +330,7 @@ bb:
; GCN-LABEL: {{^}}s_test_smed3_i32_pat_13:
; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @smin(i32 %y, i32 %x)
%tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -340,7 +342,7 @@ bb:
; GCN-LABEL: {{^}}s_test_smed3_i32_pat_14:
; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @smin(i32 %y, i32 %x)
%tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -352,7 +354,7 @@ bb:
; GCN-LABEL: {{^}}s_test_smed3_i32_pat_15:
; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @smin(i32 %y, i32 %x)
%tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -362,12 +364,13 @@ bb:
ret void
}
+; FIXME: Should keep scalar or not promote
; GCN-LABEL: {{^}}s_test_smed3_i16_pat_0:
; GCN: s_sext_i32_i16
; GCN: s_sext_i32_i16
; GCN: s_sext_i32_i16
; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 {
bb:
%tmp0 = call i16 @smin16(i16 %x, i16 %y)
%tmp1 = call i16 @smax16(i16 %x, i16 %y)
@@ -382,7 +385,7 @@ bb:
; GCN: s_sext_i32_i8
; GCN: s_sext_i32_i8
; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 {
bb:
%tmp0 = call i8 @smin8(i8 %x, i8 %y)
%tmp1 = call i8 @smax8(i8 %x, i8 %y)
@@ -394,7 +397,7 @@ bb:
; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_0:
; GCN-NOT: v_med3_i32
-define void @s_test_smed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @smin(i32 %x, i32 %y)
%tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -407,7 +410,7 @@ bb:
; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_1:
; GCN-NOT: v_med3_i32
-define void @s_test_smed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @smin(i32 %x, i32 %y)
%tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -420,7 +423,7 @@ bb:
; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_2:
; GCN-NOT: v_med3_i32
-define void @s_test_smed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @smin(i32 %x, i32 %y)
%tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -433,7 +436,7 @@ bb:
; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_result:
; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @smin(i32 %x, i32 %y)
%tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -444,6 +447,35 @@ bb:
ret void
}
+; GCN-LABEL: {{^}}v_test_smed3_i16_pat_0:
+; SI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; FIXME: VI not matching med3
+; VI: v_min_i16
+; VI: v_max_i16
+; VI: v_min_i16
+; VI: v_max_i16
+
+; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
+bb:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid
+ %gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3
+ %gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8
+ %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
+ %x = load i16, i16 addrspace(1)* %gep0
+ %y = load i16, i16 addrspace(1)* %gep1
+ %z = load i16, i16 addrspace(1)* %gep2
+
+ %tmp0 = call i16 @smin16(i16 %x, i16 %y)
+ %tmp1 = call i16 @smax16(i16 %x, i16 %y)
+ %tmp2 = call i16 @smin16(i16 %tmp1, i16 %z)
+ %tmp3 = call i16 @smax16(i16 %tmp0, i16 %tmp2)
+ store i16 %tmp3, i16 addrspace(1)* %out.gep
+ ret void
+}
+
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind readnone alwaysinline }
diff --git a/test/CodeGen/AMDGPU/sminmax.ll b/test/CodeGen/AMDGPU/sminmax.ll
index ce5d92451647..827d672022eb 100644
--- a/test/CodeGen/AMDGPU/sminmax.ll
+++ b/test/CodeGen/AMDGPU/sminmax.ll
@@ -7,7 +7,7 @@
; GCN: s_add_i32
; EG: MAX_INT
-define void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind {
%neg = sub i32 0, %val
%cond = icmp sgt i32 %val, %neg
%res = select i1 %cond, i32 %val, i32 %neg
@@ -22,7 +22,7 @@ define void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind {
; GCN: v_add_i32
; EG: MAX_INT
-define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
+define amdgpu_kernel void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
%val = load i32, i32 addrspace(1)* %src, align 4
%neg = sub i32 0, %val
%cond = icmp sgt i32 %val, %neg
@@ -36,7 +36,7 @@ define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind
; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]]
; GCN: v_max_i32_e32 [[MAX:v[0-9]+]], [[NEG]], [[SRC]]
; GCN: v_mul_lo_i32 v{{[0-9]+}}, [[MAX]], [[MAX]]
-define void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
+define amdgpu_kernel void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
%val = load i32, i32 addrspace(1)* %src, align 4
%neg = sub i32 0, %val
%cond = icmp sgt i32 %val, %neg
@@ -54,7 +54,7 @@ define void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %sr
; EG: MAX_INT
; EG: MAX_INT
-define void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind {
+define amdgpu_kernel void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind {
%z0 = insertelement <2 x i32> undef, i32 0, i32 0
%z1 = insertelement <2 x i32> %z0, i32 0, i32 1
%t0 = insertelement <2 x i32> undef, i32 2, i32 0
@@ -79,7 +79,7 @@ define void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind
; EG: MAX_INT
; EG: MAX_INT
-define void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind {
+define amdgpu_kernel void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind {
%z0 = insertelement <2 x i32> undef, i32 0, i32 0
%z1 = insertelement <2 x i32> %z0, i32 0, i32 1
%t0 = insertelement <2 x i32> undef, i32 2, i32 0
@@ -109,7 +109,7 @@ define void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %
; EG: MAX_INT
; EG: MAX_INT
; EG: MAX_INT
-define void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind {
+define amdgpu_kernel void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind {
%z0 = insertelement <4 x i32> undef, i32 0, i32 0
%z1 = insertelement <4 x i32> %z0, i32 0, i32 1
%z2 = insertelement <4 x i32> %z1, i32 0, i32 2
@@ -146,7 +146,7 @@ define void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind
; EG: MAX_INT
; EG: MAX_INT
; EG: MAX_INT
-define void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %src) nounwind {
+define amdgpu_kernel void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %src) nounwind {
%z0 = insertelement <4 x i32> undef, i32 0, i32 0
%z1 = insertelement <4 x i32> %z0, i32 0, i32 1
%z2 = insertelement <4 x i32> %z1, i32 0, i32 2
@@ -170,7 +170,7 @@ define void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %
; GCN-DAG: s_min_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]]
; GCN-DAG: s_max_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]]
-define void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %val0, i32 %val1) nounwind {
+define amdgpu_kernel void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %val0, i32 %val1) nounwind {
%cond0 = icmp sgt i32 %val0, %val1
%sel0 = select i1 %cond0, i32 %val0, i32 %val1
%sel1 = select i1 %cond0, i32 %val1, i32 %val0
@@ -186,7 +186,7 @@ define void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32
; GCN-DAG: v_min_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]]
; GCN-DAG: v_max_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]]
-define void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
+define amdgpu_kernel void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
%val0 = load volatile i32, i32 addrspace(1)* %ptr0
%val1 = load volatile i32, i32 addrspace(1)* %ptr1
@@ -208,7 +208,7 @@ define void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32
; GCN-DAG: s_max_i32
; GCN-DAG: s_max_i32
; GCN-DAG: s_max_i32
-define void @s_min_max_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, <4 x i32> %val0, <4 x i32> %val1) nounwind {
+define amdgpu_kernel void @s_min_max_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, <4 x i32> %val0, <4 x i32> %val1) nounwind {
%cond0 = icmp sgt <4 x i32> %val0, %val1
%sel0 = select <4 x i1> %cond0, <4 x i32> %val0, <4 x i32> %val1
%sel1 = select <4 x i1> %cond0, <4 x i32> %val1, <4 x i32> %val0
@@ -223,7 +223,7 @@ define void @s_min_max_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
-define void @v_min_max_i32_user(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
+define amdgpu_kernel void @v_min_max_i32_user(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
%val0 = load volatile i32, i32 addrspace(1)* %ptr0
%val1 = load volatile i32, i32 addrspace(1)* %ptr1
diff --git a/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/test/CodeGen/AMDGPU/sminmax.v2i16.ll
new file mode 100644
index 000000000000..4e093cdece21
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -0,0 +1,224 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
+
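+; abs is expressed in IR as select(x > 0-x, x, 0-x). On GFX9 this should
+; lower to the packed v_pk_sub_i16/v_pk_max_i16 pair on a <2 x i16> value,
+; while VI and CI decompose it into per-component operations.
+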
+; GCN-LABEL: {{^}}s_abs_v2i16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
+; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
+
+; VI: v_sub_i32_e32
+; VI-DAG: v_sub_i32_e32
+; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: v_add_i32_e32
+; VI: v_add_i32_e32
+; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
+; CI: v_sub_i32_e32
+; CI-DAG: v_sub_i32_e32
+; CI: v_bfe_i32
+; CI-DAG: v_bfe_i32
+; CI-DAG: v_add_i32_e32
+; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16
+; CI: v_add_i32_e32
+; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff,
+; CI: v_or_b32_e32
+define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 {
+ %neg = sub <2 x i16> zeroinitializer, %val
+ %cond = icmp sgt <2 x i16> %val, %neg
+ %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg
+ %res2 = add <2 x i16> %res, <i16 2, i16 2>
+ store <2 x i16> %res2, <2 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_abs_v2i16:
+; GFX9: flat_load_dword [[VAL:v[0-9]+]]
+; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
+; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
+
+; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; VI: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}
+; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}
+; VI-NOT: v_and_b32
+; VI: v_or_b32_e32
+define amdgpu_kernel void @v_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.in = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %src, i32 %tid
+ %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %val = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in, align 4
+ %neg = sub <2 x i16> zeroinitializer, %val
+ %cond = icmp sgt <2 x i16> %val, %neg
+ %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg
+ %res2 = add <2 x i16> %res, <i16 2, i16 2>
+ store <2 x i16> %res2, <2 x i16> addrspace(1)* %gep.out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_abs_v2i16_2:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
+; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
+define amdgpu_kernel void @s_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 {
+ %z0 = insertelement <2 x i16> undef, i16 0, i16 0
+ %z1 = insertelement <2 x i16> %z0, i16 0, i16 1
+ %t0 = insertelement <2 x i16> undef, i16 2, i16 0
+ %t1 = insertelement <2 x i16> %t0, i16 2, i16 1
+ %neg = sub <2 x i16> %z1, %val
+ %cond = icmp sgt <2 x i16> %val, %neg
+ %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg
+ %res2 = add <2 x i16> %res, %t1
+ store <2 x i16> %res2, <2 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_abs_v2i16_2:
+; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
+; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
+; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
+define amdgpu_kernel void @v_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 {
+ %z0 = insertelement <2 x i16> undef, i16 0, i16 0
+ %z1 = insertelement <2 x i16> %z0, i16 0, i16 1
+ %t0 = insertelement <2 x i16> undef, i16 2, i16 0
+ %t1 = insertelement <2 x i16> %t0, i16 2, i16 1
+ %val = load <2 x i16>, <2 x i16> addrspace(1)* %src, align 4
+ %neg = sub <2 x i16> %z1, %val
+ %cond = icmp sgt <2 x i16> %val, %neg
+ %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg
+ %res2 = add <2 x i16> %res, %t1
+ store <2 x i16> %res2, <2 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_abs_v4i16:
+; GFX9: s_load_dword [[VAL0:s[0-9]+]]
+; GFX9: s_load_dword [[VAL1:s[0-9]+]]
+; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, [[VAL0]]
+; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], [[VAL0]], [[SUB0]]
+; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2
+
+; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, [[VAL1]]
+; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], [[VAL1]], [[SUB1]]
+; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2
+define amdgpu_kernel void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %val) #0 {
+ %z0 = insertelement <4 x i16> undef, i16 0, i16 0
+ %z1 = insertelement <4 x i16> %z0, i16 0, i16 1
+ %z2 = insertelement <4 x i16> %z1, i16 0, i16 2
+ %z3 = insertelement <4 x i16> %z2, i16 0, i16 3
+ %t0 = insertelement <4 x i16> undef, i16 2, i16 0
+ %t1 = insertelement <4 x i16> %t0, i16 2, i16 1
+ %t2 = insertelement <4 x i16> %t1, i16 2, i16 2
+ %t3 = insertelement <4 x i16> %t2, i16 2, i16 3
+ %neg = sub <4 x i16> %z3, %val
+ %cond = icmp sgt <4 x i16> %val, %neg
+ %res = select <4 x i1> %cond, <4 x i16> %val, <4 x i16> %neg
+ %res2 = add <4 x i16> %res, %t3
+ store <4 x i16> %res2, <4 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_abs_v4i16:
+; GFX9: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
+
+; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, v[[VAL0]]
+; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], v[[VAL0]], [[SUB0]]
+; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2
+
+; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, v[[VAL1]]
+; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], v[[VAL1]], [[SUB1]]
+; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2
+define amdgpu_kernel void @v_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %src) #0 {
+ %z0 = insertelement <4 x i16> undef, i16 0, i16 0
+ %z1 = insertelement <4 x i16> %z0, i16 0, i16 1
+ %z2 = insertelement <4 x i16> %z1, i16 0, i16 2
+ %z3 = insertelement <4 x i16> %z2, i16 0, i16 3
+ %t0 = insertelement <4 x i16> undef, i16 2, i16 0
+ %t1 = insertelement <4 x i16> %t0, i16 2, i16 1
+ %t2 = insertelement <4 x i16> %t1, i16 2, i16 2
+ %t3 = insertelement <4 x i16> %t2, i16 2, i16 3
+ %val = load <4 x i16>, <4 x i16> addrspace(1)* %src, align 4
+ %neg = sub <4 x i16> %z3, %val
+ %cond = icmp sgt <4 x i16> %val, %neg
+ %res = select <4 x i1> %cond, <4 x i16> %val, <4 x i16> %neg
+ %res2 = add <4 x i16> %res, %t3
+ store <4 x i16> %res2, <4 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_min_max_v2i16:
+define amdgpu_kernel void @s_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) #0 {
+ %cond0 = icmp sgt <2 x i16> %val0, %val1
+ %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1
+ %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0
+
+ store volatile <2 x i16> %sel0, <2 x i16> addrspace(1)* %out0, align 4
+ store volatile <2 x i16> %sel1, <2 x i16> addrspace(1)* %out1, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_min_max_v2i16:
+define amdgpu_kernel void @v_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 {
+ %val0 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr0
+ %val1 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr1
+
+ %cond0 = icmp sgt <2 x i16> %val0, %val1
+ %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1
+ %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0
+
+ store volatile <2 x i16> %sel0, <2 x i16> addrspace(1)* %out0, align 4
+ store volatile <2 x i16> %sel1, <2 x i16> addrspace(1)* %out1, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_min_max_v4i16:
+define amdgpu_kernel void @s_min_max_v4i16(<4 x i16> addrspace(1)* %out0, <4 x i16> addrspace(1)* %out1, <4 x i16> %val0, <4 x i16> %val1) #0 {
+ %cond0 = icmp sgt <4 x i16> %val0, %val1
+ %sel0 = select <4 x i1> %cond0, <4 x i16> %val0, <4 x i16> %val1
+ %sel1 = select <4 x i1> %cond0, <4 x i16> %val1, <4 x i16> %val0
+
+ store volatile <4 x i16> %sel0, <4 x i16> addrspace(1)* %out0, align 4
+ store volatile <4 x i16> %sel1, <4 x i16> addrspace(1)* %out1, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_min_max_v2i16_user:
+define amdgpu_kernel void @v_min_max_v2i16_user(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 {
+ %val0 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr0
+ %val1 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr1
+
+ %cond0 = icmp sgt <2 x i16> %val0, %val1
+ %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1
+ %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0
+
+ store volatile <2 x i16> %sel0, <2 x i16> addrspace(1)* %out0, align 4
+ store volatile <2 x i16> %sel1, <2 x i16> addrspace(1)* %out1, align 4
+ store volatile <2 x i1> %cond0, <2 x i1> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}u_min_max_v2i16:
+; GFX9: v_pk_max_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
+; GFX9: v_pk_min_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @u_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) nounwind {
+ %cond0 = icmp ugt <2 x i16> %val0, %val1
+ %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1
+ %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0
+
+ store volatile <2 x i16> %sel0, <2 x i16> addrspace(1)* %out0, align 4
+ store volatile <2 x i16> %sel1, <2 x i16> addrspace(1)* %out1, align 4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/smrd-vccz-bug.ll b/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
index daac5b92b1ef..343211b0219c 100644
--- a/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
+++ b/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
@@ -12,7 +12,7 @@
; GCN: buffer_store_dword
; GCN: [[EXIT]]:
; GCN: s_endpgm
-define void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) {
+define amdgpu_kernel void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) {
entry:
%cnd = fcmp oeq float 0.0, %cond
%sgpr = load volatile i32, i32 addrspace(2)* %in
@@ -32,7 +32,7 @@ endif:
; GCN: buffer_store_dword
; GCN: [[EXIT]]:
; GCN: s_endpgm
-define void @vccz_noworkaround(float addrspace(1)* %in, float addrspace(1)* %out) {
+define amdgpu_kernel void @vccz_noworkaround(float addrspace(1)* %in, float addrspace(1)* %out) {
entry:
%vgpr = load volatile float, float addrspace(1)* %in
%cnd = fcmp oeq float 0.0, %vgpr
diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll
index 9b118425f9cb..50f72c670598 100644
--- a/test/CodeGen/AMDGPU/smrd.ll
+++ b/test/CodeGen/AMDGPU/smrd.ll
@@ -1,16 +1,16 @@
-; RUN: llc < %s -march=amdgcn -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=SIVI %s
-; RUN: llc < %s -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=CI --check-prefix=GCN %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SICI -check-prefix=GCN -check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=SICI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=SIVI %s
; SMRD load with an immediate offset.
; GCN-LABEL: {{^}}smrd0:
; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
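; (SI and CI encode the SMRD immediate offset in dwords, while VI encodes it
; in bytes, so element 1 of an i32 array is 0x1 on SI/CI but 0x4 on VI.)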
-define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
entry:
- %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
- %1 = load i32, i32 addrspace(2)* %0
- store i32 %1, i32 addrspace(1)* %out
+ %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
+ %tmp1 = load i32, i32 addrspace(2)* %tmp
+ store i32 %tmp1, i32 addrspace(1)* %out
ret void
}
@@ -18,11 +18,11 @@ entry:
; GCN-LABEL: {{^}}smrd1:
; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
entry:
- %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
- %1 = load i32, i32 addrspace(2)* %0
- store i32 %1, i32 addrspace(1)* %out
+ %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
+ %tmp1 = load i32, i32 addrspace(2)* %tmp
+ store i32 %tmp1, i32 addrspace(1)* %out
ret void
}
@@ -33,11 +33,11 @@ entry:
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
; GCN: s_endpgm
-define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
entry:
- %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256
- %1 = load i32, i32 addrspace(2)* %0
- store i32 %1, i32 addrspace(1)* %out
+ %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 256
+ %tmp1 = load i32, i32 addrspace(2)* %tmp
+ store i32 %tmp1, i32 addrspace(1)* %out
ret void
}
@@ -48,11 +48,11 @@ entry:
; SI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
; TODO: Add VI checks
; GCN: s_endpgm
-define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
entry:
- %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
- %1 = load i32, i32 addrspace(2)* %0
- store i32 %1, i32 addrspace(1)* %out
+ %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296
+ %tmp1 = load i32, i32 addrspace(2)* %tmp
+ store i32 %tmp1, i32 addrspace(1)* %out
ret void
}
@@ -62,11 +62,11 @@ entry:
; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
-define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
entry:
- %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143
- %1 = load i32, i32 addrspace(2)* %0
- store i32 %1, i32 addrspace(1)* %out
+ %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143
+ %tmp1 = load i32, i32 addrspace(2)* %tmp
+ store i32 %tmp1, i32 addrspace(1)* %out
ret void
}
@@ -76,11 +76,11 @@ entry:
; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
; GCN: s_endpgm
-define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
entry:
- %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144
- %1 = load i32, i32 addrspace(2)* %0
- store i32 %1, i32 addrspace(1)* %out
+ %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144
+ %tmp1 = load i32, i32 addrspace(2)* %tmp
+ store i32 %tmp1, i32 addrspace(1)* %out
ret void
}
@@ -88,12 +88,12 @@ entry:
; GCN-LABEL: {{^}}smrd_load_const0:
; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
-define amdgpu_ps void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+define amdgpu_ps void @smrd_load_const0(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
main_body:
- %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
- %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
- %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+ %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
+ %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp
+ %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 16)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
ret void
}
@@ -102,14 +102,15 @@ main_body:
; GCN-LABEL: {{^}}smrd_load_const1:
; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define amdgpu_ps void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+define amdgpu_ps void @smrd_load_const1(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
main_body:
- %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
- %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
- %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1020)
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+ %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
+ %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp
+ %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1020)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
ret void
}
+
; SMRD load using the load.const intrinsic with an offset greater than the
; largest possible immediate offset.
@@ -118,12 +119,12 @@ main_body:
; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
-define amdgpu_ps void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+define amdgpu_ps void @smrd_load_const2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
main_body:
- %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
- %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
- %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1024)
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+ %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
+ %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp
+ %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1024)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
ret void
}
@@ -133,12 +134,12 @@ main_body:
; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
-define amdgpu_ps void @smrd_load_const3(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+define amdgpu_ps void @smrd_load_const3(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
main_body:
- %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
- %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
- %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1048572)
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+ %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
+ %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp
+ %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1048572)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
ret void
}
@@ -148,18 +149,17 @@ main_body:
; SIVI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
; GCN: s_endpgm
-define amdgpu_ps void @smrd_load_const4(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+define amdgpu_ps void @smrd_load_const4(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
main_body:
- %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
- %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
- %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1048576)
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+ %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
+ %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp
+ %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1048576)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
ret void
}
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #0
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
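+; (llvm.amdgcn.exp operands: target, channel-enable mask, four sources, done, vm)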
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-attributes #0 = { nounwind readnone }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/sopk-compares.ll b/test/CodeGen/AMDGPU/sopk-compares.ll
index 74acc5bc961c..c0f773ca70c2 100644
--- a/test/CodeGen/AMDGPU/sopk-compares.ll
+++ b/test/CodeGen/AMDGPU/sopk-compares.ll
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.groupstaticsize() #1
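; s_cmpk_* instructions take a 16-bit immediate; compares against constants
; that do not fit in 16 bits are expected to fall back to s_cmp_* with a
; 32-bit literal.
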
; GCN-LABEL: {{^}}br_scc_eq_i32_inline_imm:
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 4{{$}}
-define void @br_scc_eq_i32_inline_imm(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_inline_imm(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp eq i32 %cond, 4
br i1 %cmp0, label %endif, label %if
@@ -25,7 +25,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_max:
; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x7fff{{$}}
-define void @br_scc_eq_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp eq i32 %cond, 32767
br i1 %cmp0, label %endif, label %if
@@ -41,7 +41,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_max_p1:
; GCN: s_cmpk_eq_u32 s{{[0-9]+}}, 0x8000{{$}}
-define void @br_scc_eq_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp eq i32 %cond, 32768
br i1 %cmp0, label %endif, label %if
@@ -57,7 +57,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_ne_i32_simm16_max_p1:
; GCN: s_cmpk_lg_u32 s{{[0-9]+}}, 0x8000{{$}}
-define void @br_scc_ne_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ne_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp ne i32 %cond, 32768
br i1 %cmp0, label %endif, label %if
@@ -73,7 +73,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_min:
; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x8000{{$}}
-define void @br_scc_eq_i32_simm16_min(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_simm16_min(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp eq i32 %cond, -32768
br i1 %cmp0, label %endif, label %if
@@ -89,7 +89,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_min_m1:
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0xffff7fff{{$}}
-define void @br_scc_eq_i32_simm16_min_m1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_simm16_min_m1(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp eq i32 %cond, -32769
br i1 %cmp0, label %endif, label %if
@@ -105,7 +105,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_eq_i32_uimm15_max:
; GCN: s_cmpk_eq_u32 s{{[0-9]+}}, 0xffff{{$}}
-define void @br_scc_eq_i32_uimm15_max(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_uimm15_max(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp eq i32 %cond, 65535
br i1 %cmp0, label %endif, label %if
@@ -121,7 +121,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_eq_i32_uimm16_max:
; GCN: s_cmpk_eq_u32 s{{[0-9]+}}, 0xffff{{$}}
-define void @br_scc_eq_i32_uimm16_max(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_uimm16_max(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp eq i32 %cond, 65535
br i1 %cmp0, label %endif, label %if
@@ -137,7 +137,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_eq_i32_uimm16_max_p1:
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0x10000{{$}}
-define void @br_scc_eq_i32_uimm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_uimm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp eq i32 %cond, 65536
br i1 %cmp0, label %endif, label %if
@@ -154,7 +154,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_eq_i32:
; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x41{{$}}
-define void @br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp eq i32 %cond, 65
br i1 %cmp0, label %endif, label %if
@@ -170,7 +170,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_ne_i32:
; GCN: s_cmpk_lg_i32 s{{[0-9]+}}, 0x41{{$}}
-define void @br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp ne i32 %cond, 65
br i1 %cmp0, label %endif, label %if
@@ -186,7 +186,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_sgt_i32:
; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x41{{$}}
-define void @br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp sgt i32 %cond, 65
br i1 %cmp0, label %endif, label %if
@@ -202,7 +202,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_sgt_i32_simm16_max:
; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x7fff{{$}}
-define void @br_scc_sgt_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_sgt_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp sgt i32 %cond, 32767
br i1 %cmp0, label %endif, label %if
@@ -218,7 +218,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_sgt_i32_simm16_max_p1:
; GCN: s_cmp_gt_i32 s{{[0-9]+}}, 0x8000{{$}}
-define void @br_scc_sgt_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_sgt_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp sgt i32 %cond, 32768
br i1 %cmp0, label %endif, label %if
@@ -234,7 +234,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_sge_i32:
; GCN: s_cmpk_ge_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
%cmp0 = icmp sge i32 %cond, %size
@@ -251,7 +251,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_slt_i32:
; GCN: s_cmpk_lt_i32 s{{[0-9]+}}, 0x41{{$}}
-define void @br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp slt i32 %cond, 65
br i1 %cmp0, label %endif, label %if
@@ -267,7 +267,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_sle_i32:
; GCN: s_cmpk_le_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
%cmp0 = icmp sle i32 %cond, %size
@@ -284,7 +284,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_ugt_i32:
; GCN: s_cmpk_gt_u32 s{{[0-9]+}}, 0x800{{$}}
-define void @br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
%cmp0 = icmp ugt i32 %cond, %size
@@ -301,7 +301,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_uge_i32:
; GCN: s_cmpk_ge_u32 s{{[0-9]+}}, 0x800{{$}}
-define void @br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
%cmp0 = icmp uge i32 %cond, %size
@@ -318,7 +318,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_ult_i32:
; GCN: s_cmpk_lt_u32 s{{[0-9]+}}, 0x41{{$}}
-define void @br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp ult i32 %cond, 65
br i1 %cmp0, label %endif, label %if
@@ -334,7 +334,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_ult_i32_min_simm16:
; GCN: s_cmp_lt_u32 s2, 0xffff8000
-define void @br_scc_ult_i32_min_simm16(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ult_i32_min_simm16(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp ult i32 %cond, -32768
br i1 %cmp0, label %endif, label %if
@@ -350,7 +350,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_ult_i32_min_simm16_m1:
; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 0xffff7fff{{$}}
-define void @br_scc_ult_i32_min_simm16_m1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ult_i32_min_simm16_m1(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp ult i32 %cond, -32769
br i1 %cmp0, label %endif, label %if
@@ -366,7 +366,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_ule_i32:
; GCN: s_cmpk_le_u32 s{{[0-9]+}}, 0x800{{$}}
-define void @br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
%cmp0 = icmp ule i32 %cond, %size
@@ -383,7 +383,7 @@ endif:
; GCN-LABEL: {{^}}commute_br_scc_eq_i32:
; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
%cmp0 = icmp eq i32 %size, %cond
@@ -400,7 +400,7 @@ endif:
; GCN-LABEL: {{^}}commute_br_scc_ne_i32:
; GCN: s_cmpk_lg_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
%cmp0 = icmp ne i32 %size, %cond
@@ -417,7 +417,7 @@ endif:
; GCN-LABEL: {{^}}commute_br_scc_sgt_i32:
; GCN: s_cmpk_lt_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
%cmp0 = icmp sgt i32 %size, %cond
@@ -434,7 +434,7 @@ endif:
; GCN-LABEL: {{^}}commute_br_scc_sge_i32:
; GCN: s_cmpk_le_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
%cmp0 = icmp sge i32 %size, %cond
@@ -451,7 +451,7 @@ endif:
; GCN-LABEL: {{^}}commute_br_scc_slt_i32:
; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
%cmp0 = icmp slt i32 %size, %cond
@@ -468,7 +468,7 @@ endif:
; GCN-LABEL: {{^}}commute_br_scc_sle_i32:
; GCN: s_cmpk_ge_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
%cmp0 = icmp sle i32 %size, %cond
@@ -485,7 +485,7 @@ endif:
; GCN-LABEL: {{^}}commute_br_scc_ugt_i32:
; GCN: s_cmpk_lt_u32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
%cmp0 = icmp ugt i32 %size, %cond
@@ -502,7 +502,7 @@ endif:
; GCN-LABEL: {{^}}commute_br_scc_uge_i32:
; GCN: s_cmpk_le_u32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
%cmp0 = icmp uge i32 %size, %cond
@@ -519,7 +519,7 @@ endif:
; GCN-LABEL: {{^}}commute_br_scc_ult_i32:
; GCN: s_cmpk_gt_u32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
%cmp0 = icmp ult i32 %size, %cond
@@ -536,7 +536,7 @@ endif:
; GCN-LABEL: {{^}}commute_br_scc_ule_i32:
; GCN: s_cmpk_ge_u32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
%cmp0 = icmp ule i32 %size, %cond
@@ -553,7 +553,7 @@ endif:
; GCN-LABEL: {{^}}br_scc_ult_i32_non_u16:
; GCN: s_cmp_lt_u32 s2, 0xfffff7ff
-define void @br_scc_ult_i32_non_u16(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ult_i32_non_u16(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
%not.size = xor i32 %size, -1
@@ -573,7 +573,7 @@ endif:
; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, 4
; SI: v_cmp_eq_u64_e64
-define void @br_scc_eq_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp eq i64 %cond, 4
br i1 %cmp0, label %endif, label %if
@@ -593,7 +593,7 @@ endif:
; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
; SI: v_cmp_eq_u64_e32
-define void @br_scc_eq_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp eq i64 %cond, 1234
br i1 %cmp0, label %endif, label %if
@@ -611,7 +611,7 @@ endif:
; VI: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, 4
; SI: v_cmp_ne_u64_e64
-define void @br_scc_ne_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ne_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp ne i64 %cond, 4
br i1 %cmp0, label %endif, label %if
@@ -631,7 +631,7 @@ endif:
; VI: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
; SI: v_cmp_ne_u64_e32
-define void @br_scc_ne_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ne_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = icmp ne i64 %cond, 1234
br i1 %cmp0, label %endif, label %if
diff --git a/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll b/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
index ff9429843b22..63ea21b05339 100644
--- a/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
+++ b/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
@@ -4,7 +4,7 @@
; allocate scratch registers correctly. Check that this test compiles without
; error.
; TONGA-LABEL: test
-define void @test(<256 x i32> addrspace(1)* %out, <256 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test(<256 x i32> addrspace(1)* %out, <256 x i32> addrspace(1)* %in) {
entry:
%mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
diff --git a/test/CodeGen/AMDGPU/spill-cfg-position.ll b/test/CodeGen/AMDGPU/spill-cfg-position.ll
new file mode 100644
index 000000000000..1ca0919258a8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/spill-cfg-position.ll
@@ -0,0 +1,78 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs -stress-regalloc=6 < %s | FileCheck %s
+
+; The inline spiller can decide to move a spill as early as possible in the
+; basic block. It will skip phis and labels, but we also need to make sure it
+; skips instructions in the basic block prologue which restore the exec mask.
+; Make sure the instruction that restores the exec mask immediately follows
+; the label.
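+;
+; A sketch of the expected layout (register operands are illustrative):
+;   BB0_1:
+;     s_or_b64 exec, exec, s[0:1]   ; block prologue restores the exec mask first
+;     buffer_load_dword v0, ...     ; a spill reload may only follow it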
+
+; CHECK-LABEL: {{^}}spill_cfg_position:
+; CHECK: s_cbranch_execz [[LABEL1:BB[0-9_]+]]
+; CHECK: {{^}}[[LABEL1]]:
+; CHECK: s_cbranch_execz [[LABEL2:BB[0-9_]+]]
+; CHECK: {{^}}[[LABEL2]]:
+; CHECK-NEXT: s_or_b64 exec
+; CHECK: buffer_
+
+define amdgpu_kernel void @spill_cfg_position(i32 addrspace(1)* nocapture %arg) {
+bb:
+ %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+ %tmp14 = load i32, i32 addrspace(1)* %arg, align 4
+ %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+ %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4
+ %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+ %tmp18 = load i32, i32 addrspace(1)* %tmp17, align 4
+ %tmp19 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+ %tmp20 = load i32, i32 addrspace(1)* %tmp19, align 4
+ %tmp21 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 4
+ %tmp22 = load i32, i32 addrspace(1)* %tmp21, align 4
+ %tmp23 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 5
+ %tmp24 = load i32, i32 addrspace(1)* %tmp23, align 4
+ %tmp25 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 6
+ %tmp26 = load i32, i32 addrspace(1)* %tmp25, align 4
+ %tmp27 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 7
+ %tmp28 = load i32, i32 addrspace(1)* %tmp27, align 4
+ %tmp29 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 8
+ %tmp30 = load i32, i32 addrspace(1)* %tmp29, align 4
+ %tmp33 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp1
+ %tmp34 = load i32, i32 addrspace(1)* %tmp33, align 4
+ %tmp35 = icmp eq i32 %tmp34, 0
+ br i1 %tmp35, label %bb44, label %bb36
+
+bb36: ; preds = %bb
+ %tmp37 = mul nsw i32 %tmp20, %tmp18
+ %tmp38 = add nsw i32 %tmp37, %tmp16
+ %tmp39 = mul nsw i32 %tmp24, %tmp22
+ %tmp40 = add nsw i32 %tmp38, %tmp39
+ %tmp41 = mul nsw i32 %tmp28, %tmp26
+ %tmp42 = add nsw i32 %tmp40, %tmp41
+ %tmp43 = add nsw i32 %tmp42, %tmp30
+ br label %bb52
+
+bb44: ; preds = %bb
+ %tmp45 = mul nsw i32 %tmp18, %tmp16
+ %tmp46 = mul nsw i32 %tmp22, %tmp20
+ %tmp47 = add nsw i32 %tmp46, %tmp45
+ %tmp48 = mul nsw i32 %tmp26, %tmp24
+ %tmp49 = add nsw i32 %tmp47, %tmp48
+ %tmp50 = mul nsw i32 %tmp30, %tmp28
+ %tmp51 = add nsw i32 %tmp49, %tmp50
+ br label %bb52
+
+bb52: ; preds = %bb44, %bb36
+ %tmp53 = phi i32 [ %tmp43, %bb36 ], [ %tmp51, %bb44 ]
+ %tmp54 = mul nsw i32 %tmp16, %tmp14
+ %tmp55 = mul nsw i32 %tmp22, %tmp18
+ %tmp56 = mul nsw i32 %tmp24, %tmp20
+ %tmp57 = mul nsw i32 %tmp30, %tmp26
+ %tmp58 = add i32 %tmp55, %tmp54
+ %tmp59 = add i32 %tmp58, %tmp56
+ %tmp60 = add i32 %tmp59, %tmp28
+ %tmp61 = add i32 %tmp60, %tmp57
+ %tmp62 = add i32 %tmp61, %tmp53
+ store i32 %tmp62, i32 addrspace(1)* %tmp33, align 4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/spill-m0.ll b/test/CodeGen/AMDGPU/spill-m0.ll
index 8c16b9d1649c..0e715c453209 100644
--- a/test/CodeGen/AMDGPU/spill-m0.ll
+++ b/test/CodeGen/AMDGPU/spill-m0.ll
@@ -17,11 +17,11 @@
; TOVMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
; TOVMEM-DAG: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]]
-; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Spill
+; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; 4-byte Folded Spill
; TOVMEM: s_waitcnt vmcnt(0)
; TOSMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
-; TOSMEM: s_mov_b32 m0, s3{{$}}
+; TOSMEM: s_add_u32 m0, s3, 0x100{{$}}
; TOSMEM-NOT: [[M0_COPY]]
; TOSMEM: s_buffer_store_dword [[M0_COPY]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Spill
; TOSMEM: s_waitcnt lgkmcnt(0)
@@ -32,18 +32,18 @@
; TOVGPR: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[SPILL_VREG]], 0
; TOVGPR: s_mov_b32 m0, [[M0_RESTORE]]
-; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Reload
+; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; 4-byte Folded Reload
; TOVMEM: s_waitcnt vmcnt(0)
; TOVMEM: v_readfirstlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]]
; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]]
-; TOSMEM: s_mov_b32 m0, s3{{$}}
+; TOSMEM: s_add_u32 m0, s3, 0x100{{$}}
; TOSMEM: s_buffer_load_dword [[M0_RESTORE:s[0-9]+]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Reload
; TOSMEM-NOT: [[M0_RESTORE]]
; TOSMEM: s_mov_b32 m0, [[M0_RESTORE]]
; GCN: s_add_i32 s{{[0-9]+}}, m0, 1
-define void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0
%cmp0 = icmp eq i32 %cond, 0
@@ -67,12 +67,12 @@ endif:
; GCN: v_interp_mov_f32
; TOSMEM-NOT: s_m0
-; TOSMEM: s_mov_b32 m0, s7
+; TOSMEM: s_add_u32 m0, s7, 0x100
; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
; TOSMEM-NOT: m0
; TOSMEM-NOT: m0
-; TOSMEM: s_add_u32 m0, s7, 0x100
+; TOSMEM: s_add_u32 m0, s7, 0x200
; TOSMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
; TOSMEM-NOT: m0
@@ -81,16 +81,16 @@ endif:
; TOSMEM: s_branch
; TOSMEM: BB{{[0-9]+_[0-9]+}}:
-; TOSMEM-NEXT: s_add_u32 m0, s7, 0x100
+; TOSMEM-NEXT: s_add_u32 m0, s7, 0x200
; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
; GCN-NOT: v_readlane_b32 m0
; GCN-NOT: s_buffer_store_dword m0
; GCN-NOT: s_buffer_load_dword m0
-define amdgpu_ps void @spill_kill_m0_lds(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3) #0 {
+define amdgpu_ps void @spill_kill_m0_lds(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %m0) #0 {
main_body:
- %tmp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %arg3)
+ %tmp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %m0)
%cmp = fcmp ueq float 0.000000e+00, %tmp
br i1 %cmp, label %if, label %else
@@ -100,14 +100,13 @@ if: ; preds = %main_body
br label %endif
else: ; preds = %main_body
- %interp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %arg3)
+ %interp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %m0)
br label %endif
endif: ; preds = %else, %if
%export = phi float [ %lds_data, %if ], [ %interp, %else ]
- %tmp4 = call i32 @llvm.SI.packf16(float %export, float %export)
- %tmp5 = bitcast i32 %tmp4 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp5, float %tmp5, float %tmp5, float %tmp5)
+ %tmp4 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %export, float %export)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp4, <2 x half> %tmp4, i1 true, i1 true) #0
ret void
}
@@ -122,7 +121,7 @@ endif: ; preds = %else, %if
; GCN: ; clobber m0
; TOSMEM: s_mov_b32 vcc_hi, m0
-; TOSMEM: s_mov_b32 m0, s3
+; TOSMEM: s_add_u32 m0, s3, 0x100
; TOSMEM-NEXT: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
; TOSMEM: s_mov_b32 m0, vcc_hi
@@ -131,16 +130,16 @@ endif: ; preds = %else, %if
; TOSMEM: s_branch
; TOSMEM: BB{{[0-9]+_[0-9]+}}:
-; TOSMEM-NEXT: s_mov_b32 m0, s3
+; TOSMEM-NEXT: s_add_u32 m0, s3, 0x100
; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
; GCN-NOT: v_readlane_b32 m0
; GCN-NOT: s_buffer_store_dword m0
; GCN-NOT: s_buffer_load_dword m0
-define void @m0_unavailable_spill(i32 %arg3) #0 {
+define amdgpu_kernel void @m0_unavailable_spill(i32 %m0.arg) #0 {
main_body:
%m0 = call i32 asm sideeffect "; def $0, 1", "={M0}"() #0
- %tmp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %arg3)
+ %tmp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %m0.arg)
call void asm sideeffect "; clobber $0", "~{M0}"() #0
%cmp = fcmp ueq float 0.000000e+00, %tmp
br i1 %cmp, label %if, label %else
@@ -161,10 +160,10 @@ endif:
; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]]
; TOSMEM: s_cmp_eq_u32
; TOSMEM-NOT: m0
-; TOSMEM: s_mov_b32 m0, s3
+; TOSMEM: s_add_u32 m0, s3, 0x100
; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill
; TOSMEM-NOT: m0
-; TOSMEM: s_add_u32 m0, s3, 0x200
+; TOSMEM: s_add_u32 m0, s3, 0x300
; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill
; TOSMEM-NOT: m0
; TOSMEM: s_cbranch_scc1
@@ -172,7 +171,7 @@ endif:
; TOSMEM: s_mov_b32 m0, -1
; TOSMEM: s_mov_b32 vcc_hi, m0
-; TOSMEM: s_mov_b32 m0, s3
+; TOSMEM: s_add_u32 m0, s3, 0x100
; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload
; TOSMEM: s_mov_b32 m0, vcc_hi
; TOSMEM: s_waitcnt lgkmcnt(0)
@@ -180,7 +179,7 @@ endif:
; TOSMEM: ds_write_b64
; TOSMEM-NOT: m0
-; TOSMEM: s_add_u32 m0, s3, 0x200
+; TOSMEM: s_add_u32 m0, s3, 0x300
; TOSMEM: s_buffer_load_dword s0, s[88:91], m0 ; 4-byte Folded Reload
; TOSMEM-NOT: m0
; TOSMEM: s_waitcnt lgkmcnt(0)
@@ -190,7 +189,7 @@ endif:
; TOSMEM: s_dcache_wb
; TOSMEM: s_endpgm
-define void @restore_m0_lds(i32 %arg) {
+define amdgpu_kernel void @restore_m0_lds(i32 %arg) {
%m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0
%sval = load volatile i64, i64 addrspace(2)* undef
%cmp = icmp eq i32 %arg, 0
@@ -205,10 +204,10 @@ ret:
ret void
}
-declare float @llvm.SI.fs.constant(i32, i32, i32) readnone
-
-declare i32 @llvm.SI.packf16(float, float) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index 9b3dfab2be6a..c05021a91ff0 100644
--- a/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -11,7 +11,7 @@
; Just test that it compiles successfully.
; CHECK-LABEL: test
-define void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in) {
entry:
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
diff --git a/test/CodeGen/AMDGPU/spill-wide-sgpr.ll b/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
index cab45be8da50..ebba35a6689a 100644
--- a/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
+++ b/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
@@ -3,11 +3,11 @@
; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -amdgpu-spill-sgpr-to-vgpr=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VMEM %s
; ALL-LABEL: {{^}}spill_sgpr_x2:
-; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_add_u32 m0, s3, 0x100{{$}}
; SMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 8-byte Folded Spill
; SMEM: s_cbranch_scc1
-; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_add_u32 m0, s3, 0x100{{$}}
; SMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 8-byte Folded Reload
; SMEM: s_dcache_wb
@@ -44,11 +44,11 @@ ret:
}
; ALL-LABEL: {{^}}spill_sgpr_x4:
-; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_add_u32 m0, s3, 0x100{{$}}
; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[12:15], m0 ; 16-byte Folded Spill
; SMEM: s_cbranch_scc1
-; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_add_u32 m0, s3, 0x100{{$}}
; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[12:15], m0 ; 16-byte Folded Reload
; SMEM: s_dcache_wb
; SMEM: s_endpgm
@@ -93,15 +93,15 @@ ret:
; ALL-LABEL: {{^}}spill_sgpr_x8:
-; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_add_u32 m0, s3, 0x100{{$}}
; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Spill
-; SMEM: s_add_u32 m0, s3, 16
+; SMEM: s_add_u32 m0, s3, 0x110{{$}}
; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Spill
; SMEM: s_cbranch_scc1
-; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_add_u32 m0, s3, 0x100{{$}}
; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Reload
-; SMEM: s_add_u32 m0, s3, 16
+; SMEM: s_add_u32 m0, s3, 0x110{{$}}
; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Reload
; SMEM: s_dcache_wb
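The spill_sgpr_x8 offsets encode the slot layout directly: a 0x100-byte base bias plus 16 for the second half, since each s_buffer_store_dwordx4 writes 16 bytes (0x110 = 0x100 + 16). A sketch of the two-part spill; the spilled registers are illustrative, the rest mirrors the checks:

  s_add_u32 m0, s3, 0x100                       ; first four SGPRs at base + 0x100
  s_buffer_store_dwordx4 s[4:7], s[16:19], m0   ; 16-byte Folded Spill
  s_add_u32 m0, s3, 0x110                       ; next four SGPRs 16 bytes later
  s_buffer_store_dwordx4 s[8:11], s[16:19], m0  ; 16-byte Folded Spill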
diff --git a/test/CodeGen/AMDGPU/split-scalar-i64-add.ll b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
index d4e2dc814050..5d7d29db3a2f 100644
--- a/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
+++ b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
@@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() readnone
; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0:
; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x18f, v{{[0-9]+}}
; SI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc
-define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) {
+define amdgpu_kernel void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) {
%v.val = load volatile i32, i32 addrspace(1)* %in
%vec.0 = insertelement <2 x i32> undef, i32 %s.val, i32 0
%vec.1 = insertelement <2 x i32> %vec.0, i32 %v.val, i32 1
@@ -23,7 +23,7 @@ define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1
; FUNC-LABEL: {{^}}s_imp_def_vcc_split_i64_add_0:
; SI: s_add_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x18f
; SI: s_addc_u32 {{s[0-9]+}}, 0xf423f, 0
-define void @s_imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) {
+define amdgpu_kernel void @s_imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) {
%vec.0 = insertelement <2 x i32> undef, i32 %val, i32 0
%vec.1 = insertelement <2 x i32> %vec.0, i32 999999, i32 1
%bc = bitcast <2 x i32> %vec.1 to i64
@@ -35,7 +35,7 @@ define void @s_imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) {
; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_1:
; SI: v_add_i32
; SI: v_addc_u32
-define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) {
+define amdgpu_kernel void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) {
%v.val = load volatile i32, i32 addrspace(1)* %in
%vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0
%vec.1 = insertelement <2 x i32> %vec.0, i32 %v.val, i32 1
@@ -48,7 +48,7 @@ define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 addrspace(1
; FUNC-LABEL: {{^}}s_imp_def_vcc_split_i64_add_1:
; SI: s_add_u32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; SI: s_addc_u32 {{s[0-9]+}}, 0x1869f, {{s[0-9]+}}
-define void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) {
+define amdgpu_kernel void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) {
%vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0
%vec.1 = insertelement <2 x i32> %vec.0, i32 99999, i32 1
%bc = bitcast <2 x i32> %vec.1 to i64
@@ -61,7 +61,7 @@ define void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i6
; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_2:
; SI: v_add_i32_e32 {{v[0-9]+}}, vcc, {{s[0-9]+}}, {{v[0-9]+}}
; SI: v_addc_u32_e32 {{v[0-9]+}}, vcc, {{v[0-9]+}}, {{v[0-9]+}}, vcc
-define void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) {
+define amdgpu_kernel void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) {
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%load = load i32, i32 addrspace(1)* %gep
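The magic immediates in the SI checks above are simply the constant high dwords of the bitcast i64 values: 999999 = 0xf423f in s_imp_def_vcc_split_i64_add_0 and 99999 = 0x1869f in s_imp_def_vcc_split_i64_add_1, with 399 = 0x18f as the addend. The 64-bit add splits into a low add that defines the carry and a high add that consumes it; roughly (registers illustrative):

  s_add_u32  s0, s0, 0x18f    ; low dword, sets SCC
  s_addc_u32 s1, 0xf423f, 0   ; high dword, consumes SCC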
diff --git a/test/CodeGen/AMDGPU/split-smrd.ll b/test/CodeGen/AMDGPU/split-smrd.ll
index d07da1030936..cdb1b1e3b503 100644
--- a/test/CodeGen/AMDGPU/split-smrd.ll
+++ b/test/CodeGen/AMDGPU/split-smrd.ll
@@ -1,11 +1,11 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; FIXME: Move this to sgpr-copy.ll when this is fixed on VI.
; Make sure that when we split an smrd instruction in order to move it to
; the VALU, we are also moving its users to the VALU.
-; CHECK-LABEL: {{^}}split_smrd_add_worklist:
-; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
+; GCN-LABEL: {{^}}split_smrd_add_worklist:
+; GCN: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
define amdgpu_ps void @split_smrd_add_worklist([34 x <8 x i32>] addrspace(2)* byval %arg) #0 {
bb:
%tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 96)
@@ -21,27 +21,22 @@ bb3: ; preds = %bb
%tmp6 = sext i32 %tmp5 to i64
%tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i64 0, i64 %tmp6
%tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0
- %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp9 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float bitcast (i32 1061158912 to float), float bitcast (i32 1048576000 to float)>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp10 = extractelement <4 x float> %tmp9, i32 0
- %tmp12 = call i32 @llvm.SI.packf16(float %tmp10, float undef)
- %tmp13 = bitcast i32 %tmp12 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %tmp13, float undef, float undef)
+ %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0
ret void
}
-; Function Attrs: nounwind readnone
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-
-declare i32 @llvm.SI.packf16(float, float) #1
-
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
!0 = !{!1, !1, i64 0, i32 1}
-!1 = !{!"const", !3}
-!2 = !{!1, !1, i64 0}
-!3 = !{!"tbaa root"}
+!1 = !{!"const", !2}
+!2 = !{!"tbaa root"}
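The intrinsic migration in this test is behavior-preserving: llvm.amdgcn.cvt.pkrtz returns the packed <2 x half> directly, so the i32 result and bitcast that llvm.SI.packf16 required disappear, and the export goes through llvm.amdgcn.exp.compr.v2f16. The replacement pattern, reduced to its core (values illustrative):

  %pk = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %v, float undef)
  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %pk, <2 x half> undef, i1 true, i1 true)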
diff --git a/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll b/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
index 37ec2b012896..c2426993bb3a 100644
--- a/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
+++ b/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -mattr=-promote-alloca,-load-store-opt < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -enable-amdgpu-aa=0 -verify-machineinstrs -mattr=-promote-alloca,-load-store-opt < %s | FileCheck -check-prefix=GCN %s
@sPrivateStorage = internal addrspace(3) global [256 x [8 x <4 x i64>]] undef
@@ -29,7 +29,7 @@
; GCN-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24
; GCN: s_endpgm
-define void @ds_reorder_vector_split(<4 x i64> addrspace(1)* nocapture readonly %srcValues, i32 addrspace(1)* nocapture readonly %offsets, <4 x i64> addrspace(1)* nocapture %destBuffer, i32 %alignmentOffset) #0 {
+define amdgpu_kernel void @ds_reorder_vector_split(<4 x i64> addrspace(1)* nocapture readonly %srcValues, i32 addrspace(1)* nocapture readonly %offsets, <4 x i64> addrspace(1)* nocapture %destBuffer, i32 %alignmentOffset) #0 {
entry:
%tmp = tail call i32 @llvm.r600.read.local.size.y()
%tmp1 = tail call i32 @llvm.r600.read.local.size.z()
diff --git a/test/CodeGen/AMDGPU/splitkit.mir b/test/CodeGen/AMDGPU/splitkit.mir
new file mode 100644
index 000000000000..41782af40e3c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/splitkit.mir
@@ -0,0 +1,105 @@
+# RUN: llc -o - %s -mtriple=amdgcn-- -mcpu=fiji -verify-machineinstrs -run-pass=greedy,virtregrewriter | FileCheck %s
+--- |
+ define amdgpu_kernel void @func0() #0 { ret void }
+ define amdgpu_kernel void @func1() #0 { ret void }
+ define amdgpu_kernel void @splitHoist() #0 { ret void }
+
+ attributes #0 = { "amdgpu-num-sgpr"="12" }
+...
+---
+# Make sure we only get a single spill+reload even if live range splitting
+# created a sequence of multiple copy instructions.
+# CHECK-LABEL: name: func0
+# CHECK: SI_SPILL_S128_SAVE
+# CHECK-NOT: SI_SPILL_S128_SAVE
+# CHECK: S_NOP 0
+# CHECK: SI_SPILL_S128_RESTORE
+# CHECK-NOT: SI_SPILL_S128_RESTORE
+name: func0
+body: |
+ bb.0:
+ S_NOP 0, implicit-def undef %0.sub0 : sreg_128
+ S_NOP 0, implicit-def %0.sub3 : sreg_128
+
+ ; Clobber registers
+ S_NOP 0, implicit-def dead %sgpr0, implicit-def dead %sgpr1, implicit-def dead %sgpr2, implicit-def dead %sgpr3, implicit-def dead %sgpr4, implicit-def dead %sgpr5, implicit-def dead %sgpr6, implicit-def dead %sgpr7, implicit-def dead %sgpr8, implicit-def dead %sgpr9, implicit-def dead %sgpr10, implicit-def dead %sgpr11
+
+ S_NOP 0, implicit %0.sub0
+ S_NOP 0, implicit %0.sub3
+ S_NOP 0, implicit %0.sub0
+ S_NOP 0, implicit %0.sub3
+...
+---
+# Live range splitting should split this into 2 intervals, with the second
+# allocated to sgpr0_sgpr1 and the first to something else, so we see two copies
+# in between for the two subregisters that are live.
+# CHECK-LABEL: name: func1
+# CHECK: [[REG0:%sgpr[0-9]+]] = COPY %sgpr0
+# CHECK: [[REG1:%sgpr[0-9]+]] = COPY %sgpr2
+# CHECK: S_NOP 0
+# CHECK: S_NOP 0, implicit [[REG0]]
+# CHECK: S_NOP 0, implicit [[REG1]]
+# CHECK: %sgpr0 = COPY [[REG0]]
+# CHECK: %sgpr2 = COPY [[REG1]]
+# CHECK: S_NOP
+# CHECK: S_NOP 0, implicit %sgpr0
+# CHECK: S_NOP 0, implicit %sgpr2
+name: func1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: %sgpr0, %sgpr1, %sgpr2
+ undef %0.sub0 : sreg_128 = COPY %sgpr0
+ %0.sub2 = COPY %sgpr2
+
+ S_NOP 0, implicit-def dead %sgpr0, implicit-def dead %sgpr1
+
+ S_NOP 0, implicit %0.sub0
+ S_NOP 0, implicit %0.sub2
+
+ ; Clobber everything but sgpr0-sgpr3
+ S_NOP 0, implicit-def dead %sgpr4, implicit-def dead %sgpr5, implicit-def dead %sgpr6, implicit-def dead %sgpr7, implicit-def dead %sgpr8, implicit-def dead %sgpr9, implicit-def dead %sgpr10, implicit-def dead %sgpr11, implicit-def dead %sgpr12, implicit-def dead %sgpr13, implicit-def dead %sgpr14, implicit-def dead %sgpr15, implicit-def dead %vcc_lo, implicit-def dead %vcc_hi
+
+ S_NOP 0, implicit %0.sub0
+ S_NOP 0, implicit %0.sub2
+...
+---
+# Check that copy hoisting out of loops works. This mainly checks that the
+# compiler does not crash when it hoists a subreg copy sequence.
+# CHECK-LABEL: name: splitHoist
+# CHECK: S_NOP 0, implicit-def %sgpr0
+# CHECK: S_NOP 0, implicit-def %sgpr3
+# CHECK-NEXT: SI_SPILL_S128_SAVE
+name: splitHoist
+tracksRegLiveness: true
+body: |
+ bb.0:
+ successors: %bb.1, %bb.2
+ S_NOP 0, implicit-def undef %0.sub0 : sreg_128
+ S_NOP 0, implicit-def %0.sub3 : sreg_128
+
+ S_CBRANCH_VCCNZ %bb.1, implicit undef %vcc
+ S_BRANCH %bb.2
+
+ bb.1:
+ successors: %bb.1, %bb.3
+ S_NOP 0, implicit %0.sub0
+
+ ; Clobber registers
+ S_NOP 0, implicit-def dead %sgpr0, implicit-def dead %sgpr1, implicit-def dead %sgpr2, implicit-def dead %sgpr3, implicit-def dead %sgpr4, implicit-def dead %sgpr5, implicit-def dead %sgpr6, implicit-def dead %sgpr7, implicit-def dead %sgpr8, implicit-def dead %sgpr9, implicit-def dead %sgpr10, implicit-def dead %sgpr11
+
+ S_CBRANCH_VCCNZ %bb.1, implicit undef %vcc
+ S_BRANCH %bb.3
+
+ bb.2:
+ successors: %bb.3
+ ; Clobber registers
+ S_NOP 0, implicit-def dead %sgpr0, implicit-def dead %sgpr1, implicit-def dead %sgpr2, implicit-def dead %sgpr3, implicit-def dead %sgpr4, implicit-def dead %sgpr5, implicit-def dead %sgpr6, implicit-def dead %sgpr7, implicit-def dead %sgpr8, implicit-def dead %sgpr9, implicit-def dead %sgpr10, implicit-def dead %sgpr11
+ S_BRANCH %bb.3
+
+ bb.3:
+ S_NOP 0, implicit %0.sub0
+ S_NOP 0, implicit %0.sub3
+ S_NOP 0, implicit %0.sub0
+ S_NOP 0, implicit %0.sub3
+...
diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll
index ad7c86fe7919..b4355b76016a 100644
--- a/test/CodeGen/AMDGPU/sra.ll
+++ b/test/CodeGen/AMDGPU/sra.ll
@@ -13,7 +13,7 @@ declare i32 @llvm.r600.read.tidig.x() #0
; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
%a = load <2 x i32>, <2 x i32> addrspace(1)* %in
%b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
@@ -37,7 +37,7 @@ define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32>, <4 x i32> addrspace(1)* %in
%b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
@@ -49,9 +49,9 @@ define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
; FUNC-LABEL: {{^}}ashr_v2i16:
; FIXME: The ashr operation is uniform, but because its operands come from a
; global load we end up with vector instructions rather than scalar ones.
-; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
%a = load <2 x i16>, <2 x i16> addrspace(1)* %in
%b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
@@ -63,11 +63,11 @@ define void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %i
; FUNC-LABEL: {{^}}ashr_v4i16:
; FIXME: The ashr operation is uniform, but because its operands come from a
; global load we end up with vector instructions rather than scalar ones.
-; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
%a = load <4 x i16>, <4 x i16> addrspace(1)* %in
%b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
@@ -80,7 +80,7 @@ define void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %i
; GCN: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8
; EG: ASHR
-define void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) {
entry:
%in.ext = sext i32 %in to i64
%ashr = ashr i64 %in.ext, 8
@@ -105,7 +105,7 @@ entry:
; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
; EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
-define void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
entry:
%b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
%a = load i64, i64 addrspace(1)* %in
@@ -143,7 +143,7 @@ entry:
; EG-DAG: CNDE_INT
; EG-DAG: CNDE_INT
; EG-DAG: CNDE_INT
-define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
%a = load <2 x i64>, <2 x i64> addrspace(1)* %in
%b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
@@ -156,7 +156,7 @@ define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i
; XFUNC-LABEL: {{^}}s_ashr_v2i64:
; XGCN: s_ashr_i64 {{s\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], s[0-9]+}}
; XGCN: s_ashr_i64 {{s\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], s[0-9]+}}
-; define void @s_ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in, <2 x i64> %a, <2 x i64> %b) {
+; define amdgpu_kernel void @s_ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in, <2 x i64> %a, <2 x i64> %b) {
; %result = ashr <2 x i64> %a, %b
; store <2 x i64> %result, <2 x i64> addrspace(1)* %out
; ret void
@@ -221,7 +221,7 @@ define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i
; EG-DAG: CNDE_INT
; EG-DAG: CNDE_INT
; EG-DAG: CNDE_INT
-define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
%a = load <4 x i64>, <4 x i64> addrspace(1)* %in
%b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
@@ -235,7 +235,7 @@ define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %i
; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31
; GCN: s_add_u32 s{{[0-9]+}}, s[[HI]], s{{[0-9]+}}
; GCN: s_addc_u32 s{{[0-9]+}}, s[[SHIFT]], s{{[0-9]+}}
-define void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
%result = ashr i64 %a, 32
%add = add i64 %result, %b
store i64 %add, i64 addrspace(1)* %out
@@ -247,7 +247,7 @@ define void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
; VI: flat_load_dword v[[HI:[0-9]+]]
; GCN: v_ashrrev_i32_e32 v[[SHIFT:[0-9]+]], 31, v[[HI]]
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[HI]]:[[SHIFT]]{{\]}}
-define void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
@@ -262,7 +262,7 @@ define void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31
; GCN: s_add_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}}
; GCN: s_addc_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}}
-define void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
%result = ashr i64 %a, 63
%add = add i64 %result, %b
store i64 %add, i64 addrspace(1)* %out
@@ -275,7 +275,7 @@ define void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
; GCN: v_ashrrev_i32_e32 v[[SHIFT:[0-9]+]], 31, v[[HI]]
; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[SHIFT]]
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[SHIFT]]:[[COPY]]{{\]}}
-define void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
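As the FIXMEs on ashr_v2i16 and ashr_v4i16 note, those shifts are uniform and would stay on the scalar unit if the operands arrived as kernel arguments in SGPRs instead of through global loads. A hypothetical kernel of that shape, not part of the test:

  define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) {
    %result = ashr <2 x i16> %a, %b
    store <2 x i16> %result, <2 x i16> addrspace(1)* %out
    ret void
  }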
diff --git a/test/CodeGen/AMDGPU/srem.ll b/test/CodeGen/AMDGPU/srem.ll
index c78fd549b316..c89f798397ae 100644
--- a/test/CodeGen/AMDGPU/srem.ll
+++ b/test/CodeGen/AMDGPU/srem.ll
@@ -2,7 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s
-define void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in
%den = load i32, i32 addrspace(1) * %den_ptr
@@ -11,7 +11,7 @@ define void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
ret void
}
-define void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%num = load i32, i32 addrspace(1) * %in
%result = srem i32 %num, 4
store i32 %result, i32 addrspace(1)* %out
@@ -24,14 +24,14 @@ define void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; SI: v_mul_lo_i32
; SI: v_sub_i32
; SI: s_endpgm
-define void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%num = load i32, i32 addrspace(1) * %in
%result = srem i32 %num, 7
store i32 %result, i32 addrspace(1)* %out
ret void
}
-define void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
%num = load <2 x i32>, <2 x i32> addrspace(1) * %in
%den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr
@@ -40,14 +40,14 @@ define void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
ret void
}
-define void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%num = load <2 x i32>, <2 x i32> addrspace(1) * %in
%result = srem <2 x i32> %num, <i32 4, i32 4>
store <2 x i32> %result, <2 x i32> addrspace(1)* %out
ret void
}
-define void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%num = load <4 x i32>, <4 x i32> addrspace(1) * %in
%den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr
@@ -56,14 +56,14 @@ define void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
ret void
}
-define void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%num = load <4 x i32>, <4 x i32> addrspace(1) * %in
%result = srem <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
store <4 x i32> %result, <4 x i32> addrspace(1)* %out
ret void
}
-define void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%den_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
%num = load i64, i64 addrspace(1) * %in
%den = load i64, i64 addrspace(1) * %den_ptr
@@ -72,14 +72,14 @@ define void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
ret void
}
-define void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%num = load i64, i64 addrspace(1) * %in
%result = srem i64 %num, 4
store i64 %result, i64 addrspace(1)* %out
ret void
}
-define void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
%den_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
%num = load <2 x i64>, <2 x i64> addrspace(1) * %in
%den = load <2 x i64>, <2 x i64> addrspace(1) * %den_ptr
@@ -88,14 +88,14 @@ define void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i
ret void
}
-define void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
%num = load <2 x i64>, <2 x i64> addrspace(1) * %in
%result = srem <2 x i64> %num, <i64 4, i64 4>
store <2 x i64> %result, <2 x i64> addrspace(1)* %out
ret void
}
-define void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
%den_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
%num = load <4 x i64>, <4 x i64> addrspace(1) * %in
%den = load <4 x i64>, <4 x i64> addrspace(1) * %den_ptr
@@ -104,7 +104,7 @@ define void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %i
ret void
}
-define void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
%num = load <4 x i64>, <4 x i64> addrspace(1) * %in
%result = srem <4 x i64> %num, <i64 4, i64 4, i64 4, i64 4>
store <4 x i64> %result, <4 x i64> addrspace(1)* %out
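The srem_i32_7 checks above reflect the standard lowering of remainder by a constant: the quotient comes from a multiply-high by a precomputed magic constant, and the remainder is recovered as x - 7*(x/7), which is where the v_mul_lo_i32 and v_sub_i32 come from. Decomposed as a sketch:

  %q = sdiv i32 %x, 7   ; itself lowered via multiply-high and shifts
  %m = mul i32 %q, 7    ; the v_mul_lo_i32 the test checks for
  %r = sub i32 %x, %m   ; the v_sub_i32 the test checks for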
diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll
index 6b006fd936d7..1daf4bb33e81 100644
--- a/test/CodeGen/AMDGPU/srl.ll
+++ b/test/CodeGen/AMDGPU/srl.ll
@@ -8,7 +8,7 @@ declare i32 @llvm.r600.read.tidig.x() #0
; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load i32, i32 addrspace(1)* %in
%b = load i32, i32 addrspace(1)* %b_ptr
@@ -26,7 +26,7 @@ define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
%a = load <2 x i32>, <2 x i32> addrspace(1)* %in
%b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
@@ -50,7 +50,7 @@ define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32>, <4 x i32> addrspace(1)* %in
%b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
@@ -74,7 +74,7 @@ define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]|PS}}
; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], [[SHIFT]]
; EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
-define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
%a = load i64, i64 addrspace(1)* %in
%b = load i64, i64 addrspace(1)* %b_ptr
@@ -112,7 +112,7 @@ define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; EG-DAG: CNDE_INT {{.*}}, 0.0
; EG-DAG: CNDE_INT
; EG-DAG: CNDE_INT
-define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
%a = load <2 x i64>, <2 x i64> addrspace(1)* %in
%b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
@@ -178,7 +178,7 @@ define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i
; EG-DAG: CNDE_INT
; EG-DAG: CNDE_INT
; EG-DAG: CNDE_INT
-define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
%a = load <4 x i64>, <4 x i64> addrspace(1)* %in
%b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
@@ -193,7 +193,7 @@ define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %i
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[HI_A]]
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) {
%result = lshr i64 %a, 32
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -203,7 +203,7 @@ define void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) {
; GCN-DAG: buffer_load_dword v[[HI_A:[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[HI_A]]:[[VHI]]{{\]}}
-define void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
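The s_lshr_32_i64 and v_lshr_32_i64 checks at the end rely on a 64-bit right shift by exactly 32 needing no shift instruction at all: the result's low dword is the source's high dword and its high dword is zero. Roughly (registers illustrative):

  v_mov_b32_e32 v3, 0    ; result high dword = 0
  v_mov_b32_e32 v2, v1   ; result low dword = source high dword
  buffer_store_dwordx2 v[2:3], ...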
diff --git a/test/CodeGen/AMDGPU/ssubo.ll b/test/CodeGen/AMDGPU/ssubo.ll
index 26884a1b7761..135632343f90 100644
--- a/test/CodeGen/AMDGPU/ssubo.ll
+++ b/test/CodeGen/AMDGPU/ssubo.ll
@@ -6,7 +6,7 @@ declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
; FUNC-LABEL: {{^}}ssubo_i64_zext:
-define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
%val = extractvalue { i64, i1 } %ssub, 0
%carry = extractvalue { i64, i1 } %ssub, 1
@@ -17,7 +17,7 @@ define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
}
; FUNC-LABEL: {{^}}s_ssubo_i32:
-define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
%ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
%val = extractvalue { i32, i1 } %ssub, 0
%carry = extractvalue { i32, i1 } %ssub, 1
@@ -27,7 +27,7 @@ define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
}
; FUNC-LABEL: {{^}}v_ssubo_i32:
-define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
%a = load i32, i32 addrspace(1)* %aptr, align 4
%b = load i32, i32 addrspace(1)* %bptr, align 4
%ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
@@ -41,7 +41,7 @@ define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
; FUNC-LABEL: {{^}}s_ssubo_i64:
; SI: s_sub_u32
; SI: s_subb_u32
-define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
%ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
%val = extractvalue { i64, i1 } %ssub, 0
%carry = extractvalue { i64, i1 } %ssub, 1
@@ -53,7 +53,7 @@ define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64
; FUNC-LABEL: {{^}}v_ssubo_i64:
; SI: v_sub_i32_e32
; SI: v_subb_u32_e32
-define void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
%a = load i64, i64 addrspace(1)* %aptr, align 4
%b = load i64, i64 addrspace(1)* %bptr, align 4
%ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
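The s_ssubo_i64 and v_ssubo_i64 checks show the usual split of a 64-bit subtract into a low half that defines the borrow and a high half that consumes it. In scalar form, matching the SI checks (registers illustrative):

  s_sub_u32  s0, s0, s2   ; low dword, sets the borrow in SCC
  s_subb_u32 s1, s1, s3   ; high dword, consumes the borrow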
diff --git a/test/CodeGen/AMDGPU/store-barrier.ll b/test/CodeGen/AMDGPU/store-barrier.ll
index 57a93ccd2505..afa4e94222cd 100644
--- a/test/CodeGen/AMDGPU/store-barrier.ll
+++ b/test/CodeGen/AMDGPU/store-barrier.ll
@@ -12,7 +12,7 @@
; CHECK: s_barrier
; CHECK: s_endpgm
; Function Attrs: nounwind
-define void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) #0 {
+define amdgpu_kernel void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) #0 {
bb:
%tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp9
%tmp13 = load i32, i32 addrspace(1)* %tmp10, align 2
diff --git a/test/CodeGen/AMDGPU/store-global.ll b/test/CodeGen/AMDGPU/store-global.ll
index 5d49795a68ec..160e921fc075 100644
--- a/test/CodeGen/AMDGPU/store-global.ll
+++ b/test/CodeGen/AMDGPU/store-global.ll
@@ -11,7 +11,7 @@
; CM-NOT: MEM_RAT MSKOR
; GCN: buffer_store_byte
-define void @store_i1(i1 addrspace(1)* %out) {
+define amdgpu_kernel void @store_i1(i1 addrspace(1)* %out) {
entry:
store i1 true, i1 addrspace(1)* %out
ret void
@@ -42,7 +42,7 @@ entry:
; GCN: buffer_store_byte
-define void @store_i8(i8 addrspace(1)* %out, i8 %in) {
+define amdgpu_kernel void @store_i8(i8 addrspace(1)* %out, i8 %in) {
entry:
store i8 %in, i8 addrspace(1)* %out
ret void
@@ -75,7 +75,7 @@ entry:
; EG: MOV * T[[RW_GPR]].Z, 0.0
; GCN: buffer_store_short
-define void @store_i16(i16 addrspace(1)* %out, i16 %in) {
+define amdgpu_kernel void @store_i16(i16 addrspace(1)* %out, i16 %in) {
entry:
store i16 %in, i16 addrspace(1)* %out
ret void
@@ -88,7 +88,7 @@ entry:
; EG: MEM_RAT MSKOR
; EG: MEM_RAT MSKOR
-define void @store_i24(i24 addrspace(1)* %out, i24 %in) {
+define amdgpu_kernel void @store_i24(i24 addrspace(1)* %out, i24 %in) {
entry:
store i24 %in, i24 addrspace(1)* %out
ret void
@@ -104,7 +104,7 @@ entry:
; CM: MEM_RAT_CACHELESS STORE_DWORD
; CM-NOT: MEM_RAT
-define void @store_i25(i25 addrspace(1)* %out, i25 %in) {
+define amdgpu_kernel void @store_i25(i25 addrspace(1)* %out, i25 %in) {
entry:
store i25 %in, i25 addrspace(1)* %out
ret void
@@ -119,7 +119,7 @@ entry:
; CM-NOT: MEM_RAT MSKOR
; GCN: buffer_store_short
-define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
entry:
%0 = trunc <2 x i32> %in to <2 x i8>
store <2 x i8> %0, <2 x i8> addrspace(1)* %out
@@ -136,7 +136,7 @@ entry:
; CM-NOT: MEM_RAT MSKOR
; SI: buffer_store_byte
-define void @store_v2i8_unaligned(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i8_unaligned(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
entry:
%0 = trunc <2 x i32> %in to <2 x i8>
store <2 x i8> %0, <2 x i8> addrspace(1)* %out, align 1
@@ -150,7 +150,7 @@ entry:
; CM: MEM_RAT_CACHELESS STORE_DWORD
; GCN: buffer_store_dword
-define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
entry:
%0 = trunc <2 x i32> %in to <2 x i16>
store <2 x i16> %0, <2 x i16> addrspace(1)* %out
@@ -170,7 +170,7 @@ entry:
; SI: buffer_store_short
; SI: buffer_store_short
-define void @store_v2i16_unaligned(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i16_unaligned(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
entry:
%0 = trunc <2 x i32> %in to <2 x i16>
store <2 x i16> %0, <2 x i16> addrspace(1)* %out, align 2
@@ -183,7 +183,7 @@ entry:
; CM: MEM_RAT_CACHELESS STORE_DWORD
; GCN: buffer_store_dword
-define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
entry:
%0 = trunc <4 x i32> %in to <4 x i8>
store <4 x i8> %0, <4 x i8> addrspace(1)* %out
@@ -210,7 +210,7 @@ entry:
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI-NOT: buffer_store_dword
-define void @store_v4i8_unaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8_unaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
entry:
%0 = trunc <4 x i32> %in to <4 x i8>
store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 1
@@ -231,7 +231,7 @@ entry:
; SI: buffer_store_short
; SI: buffer_store_short
; SI-NOT: buffer_store_dword
-define void @store_v4i8_halfaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8_halfaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
entry:
%0 = trunc <4 x i32> %in to <4 x i8>
store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 2
@@ -246,7 +246,7 @@ entry:
; GCN: buffer_store_dword
-define void @store_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @store_f32(float addrspace(1)* %out, float %in) {
store float %in, float addrspace(1)* %out
ret void
}
@@ -257,7 +257,7 @@ define void @store_f32(float addrspace(1)* %out, float %in) {
; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+}}
; GCN: buffer_store_dwordx2
-define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
entry:
%0 = trunc <4 x i32> %in to <4 x i16>
store <4 x i16> %0, <4 x i16> addrspace(1)* %out
@@ -272,7 +272,7 @@ entry:
; GCN: buffer_store_dwordx2
-define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
+define amdgpu_kernel void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
entry:
%0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
%1 = insertelement <2 x float> %0, float %b, i32 1
@@ -286,7 +286,7 @@ entry:
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}},
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.XY}}, {{T[0-9]+\.[XYZW]}},
-define void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind {
+define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind {
store <3 x i32> %a, <3 x i32> addrspace(1)* %out, align 16
ret void
}
@@ -299,7 +299,7 @@ define void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind {
; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
; GCN: buffer_store_dwordx4
-define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
entry:
store <4 x i32> %in, <4 x i32> addrspace(1)* %out
ret void
@@ -313,7 +313,7 @@ entry:
; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
; SI: buffer_store_dwordx4
-define void @store_v4i32_unaligned(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i32_unaligned(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
entry:
store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
ret void
@@ -328,7 +328,7 @@ entry:
; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
; GCN: buffer_store_dwordx4
-define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%1 = load <4 x float>, <4 x float> addrspace(1) * %in
store <4 x float> %1, <4 x float> addrspace(1)* %out
ret void
@@ -340,7 +340,7 @@ define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1
; CM: MEM_RAT MSKOR
; GCN: buffer_store_byte
-define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) {
entry:
%0 = trunc i64 %in to i8
store i8 %0, i8 addrspace(1)* %out
@@ -350,7 +350,7 @@ entry:
; FUNC-LABEL: {{^}}store_i64_i16:
; EG: MEM_RAT MSKOR
; GCN: buffer_store_short
-define void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) {
entry:
%0 = trunc i64 %in to i16
store i16 %0, i16 addrspace(1)* %out
@@ -369,7 +369,7 @@ entry:
; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
; GCN: buffer_store_dwordx2
-define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
+define amdgpu_kernel void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
entry:
%0 = load i32, i32 addrspace(2)* %mem, align 4
%arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1
@@ -388,7 +388,7 @@ entry:
; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+}}, T{{[0-9]+}}.X
; GCN: buffer_store_dwordx4
-define void @i128-const-store(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @i128-const-store(i32 addrspace(1)* %out) {
entry:
store i32 1, i32 addrspace(1)* %out, align 4
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
diff --git a/test/CodeGen/AMDGPU/store-local.ll b/test/CodeGen/AMDGPU/store-local.ll
index 03fd30ca9a25..c144bf2aa878 100644
--- a/test/CodeGen/AMDGPU/store-local.ll
+++ b/test/CodeGen/AMDGPU/store-local.ll
@@ -9,7 +9,7 @@
; CM: LDS_BYTE_WRITE
; GCN: ds_write_b8
-define void @store_local_i1(i1 addrspace(3)* %out) {
+define amdgpu_kernel void @store_local_i1(i1 addrspace(3)* %out) {
entry:
store i1 true, i1 addrspace(3)* %out
ret void
@@ -21,7 +21,7 @@ entry:
; CM: LDS_BYTE_WRITE
; GCN: ds_write_b8
-define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
+define amdgpu_kernel void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
store i8 %in, i8 addrspace(3)* %out
ret void
}
@@ -32,7 +32,7 @@ define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
; CM: LDS_SHORT_WRITE
; GCN: ds_write_b16
-define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
+define amdgpu_kernel void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
store i16 %in, i16 addrspace(3)* %out
ret void
}
@@ -43,7 +43,7 @@ define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
; CM: LDS_WRITE
; GCN: ds_write_b32
-define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
+define amdgpu_kernel void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
entry:
store <2 x i16> %in, <2 x i16> addrspace(3)* %out
ret void
@@ -55,7 +55,7 @@ entry:
; CM: LDS_WRITE
; GCN: ds_write_b32
-define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
+define amdgpu_kernel void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
entry:
store <4 x i8> %in, <4 x i8> addrspace(3)* %out
ret void
@@ -78,7 +78,7 @@ entry:
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8
-define void @store_local_v4i8_unaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
+define amdgpu_kernel void @store_local_v4i8_unaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
entry:
store <4 x i8> %in, <4 x i8> addrspace(3)* %out, align 1
ret void
@@ -95,7 +95,7 @@ entry:
; GCN: ds_write_b16
; GCN: ds_write_b16
-define void @store_local_v4i8_halfaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
+define amdgpu_kernel void @store_local_v4i8_halfaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
entry:
store <4 x i8> %in, <4 x i8> addrspace(3)* %out, align 2
ret void
@@ -111,7 +111,7 @@ entry:
; CM-NOT: LDS_WRITE
; GCN: ds_write_b64
-define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
entry:
store <2 x i32> %in, <2 x i32> addrspace(3)* %out
ret void
@@ -129,7 +129,7 @@ entry:
; CM: LDS_WRITE
; GCN: ds_write2_b64
-define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
entry:
store <4 x i32> %in, <4 x i32> addrspace(3)* %out
ret void
@@ -148,7 +148,7 @@ entry:
; GCN: ds_write2_b32
; GCN: ds_write2_b32
-define void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
entry:
store <4 x i32> %in, <4 x i32> addrspace(3)* %out, align 4
ret void
@@ -157,7 +157,7 @@ entry:
; FUNC-LABEL: {{^}}store_local_i64_i8:
; EG: LDS_BYTE_WRITE
; GCN: ds_write_b8
-define void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) {
+define amdgpu_kernel void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) {
entry:
%0 = trunc i64 %in to i8
store i8 %0, i8 addrspace(3)* %out
@@ -167,7 +167,7 @@ entry:
; FUNC-LABEL: {{^}}store_local_i64_i16:
; EG: LDS_SHORT_WRITE
; GCN: ds_write_b16
-define void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) {
+define amdgpu_kernel void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) {
entry:
%0 = trunc i64 %in to i16
store i16 %0, i16 addrspace(3)* %out
diff --git a/test/CodeGen/AMDGPU/store-private.ll b/test/CodeGen/AMDGPU/store-private.ll
index 33d27f24e9cf..ab73ada370ea 100644
--- a/test/CodeGen/AMDGPU/store-private.ll
+++ b/test/CodeGen/AMDGPU/store-private.ll
@@ -15,7 +15,7 @@
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; SI: buffer_store_byte
-define void @store_i1(i1 addrspace(0)* %out) {
+define amdgpu_kernel void @store_i1(i1 addrspace(0)* %out) {
entry:
store i1 true, i1 addrspace(0)* %out
ret void
@@ -44,7 +44,7 @@ entry:
; SI: buffer_store_byte
-define void @store_i8(i8 addrspace(0)* %out, i8 %in) {
+define amdgpu_kernel void @store_i8(i8 addrspace(0)* %out, i8 %in) {
entry:
store i8 %in, i8 addrspace(0)* %out
ret void
@@ -72,7 +72,7 @@ entry:
; EG: MOV * T(0 + AR.x).X+, [[RES]]
; SI: buffer_store_short
-define void @store_i16(i16 addrspace(0)* %out, i16 %in) {
+define amdgpu_kernel void @store_i16(i16 addrspace(0)* %out, i16 %in) {
entry:
store i16 %in, i16 addrspace(0)* %out
ret void
@@ -102,7 +102,7 @@ entry:
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
-define void @store_i24(i24 addrspace(0)* %out, i24 %in) {
+define amdgpu_kernel void @store_i24(i24 addrspace(0)* %out, i24 %in) {
entry:
store i24 %in, i24 addrspace(0)* %out
ret void
@@ -120,7 +120,7 @@ entry:
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM-NOT: MOVA_INT
-define void @store_i25(i25 addrspace(0)* %out, i25 %in) {
+define amdgpu_kernel void @store_i25(i25 addrspace(0)* %out, i25 %in) {
entry:
store i25 %in, i25 addrspace(0)* %out
ret void
@@ -141,7 +141,7 @@ entry:
; CM-NOT: MOVA_INT
; SI: buffer_store_short
-define void @store_v2i8(<2 x i8> addrspace(0)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i8(<2 x i8> addrspace(0)* %out, <2 x i32> %in) {
entry:
%0 = trunc <2 x i32> %in to <2 x i8>
store <2 x i8> %0, <2 x i8> addrspace(0)* %out
@@ -172,7 +172,7 @@ entry:
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; SI: buffer_store_byte
-define void @store_v2i8_unaligned(<2 x i8> addrspace(0)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i8_unaligned(<2 x i8> addrspace(0)* %out, <2 x i32> %in) {
entry:
%0 = trunc <2 x i32> %in to <2 x i8>
store <2 x i8> %0, <2 x i8> addrspace(0)* %out, align 1
@@ -191,7 +191,7 @@ entry:
; CM-NOT: MOVA_INT
; SI: buffer_store_dword
-define void @store_v2i16(<2 x i16> addrspace(0)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i16(<2 x i16> addrspace(0)* %out, <2 x i32> %in) {
entry:
%0 = trunc <2 x i32> %in to <2 x i16>
store <2 x i16> %0, <2 x i16> addrspace(0)* %out
@@ -223,7 +223,7 @@ entry:
; SI: buffer_store_short
; SI: buffer_store_short
-define void @store_v2i16_unaligned(<2 x i16> addrspace(0)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i16_unaligned(<2 x i16> addrspace(0)* %out, <2 x i32> %in) {
entry:
%0 = trunc <2 x i32> %in to <2 x i16>
store <2 x i16> %0, <2 x i16> addrspace(0)* %out, align 2
@@ -240,7 +240,7 @@ entry:
; CM-NOT: MOVA_INT
; SI: buffer_store_dword
-define void @store_v4i8(<4 x i8> addrspace(0)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8(<4 x i8> addrspace(0)* %out, <4 x i32> %in) {
entry:
%0 = trunc <4 x i32> %in to <4 x i8>
store <4 x i8> %0, <4 x i8> addrspace(0)* %out
@@ -299,7 +299,7 @@ entry:
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI-NOT: buffer_store_dword
-define void @store_v4i8_unaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8_unaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) {
entry:
%0 = trunc <4 x i32> %in to <4 x i8>
store <4 x i8> %0, <4 x i8> addrspace(0)* %out, align 1
@@ -410,7 +410,7 @@ entry:
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI-NOT: buffer_store_dword
-define void @store_v8i8_unaligned(<8 x i8> addrspace(0)* %out, <8 x i32> %in) {
+define amdgpu_kernel void @store_v8i8_unaligned(<8 x i8> addrspace(0)* %out, <8 x i32> %in) {
entry:
%0 = trunc <8 x i32> %in to <8 x i8>
store <8 x i8> %0, <8 x i8> addrspace(0)* %out, align 1
@@ -443,7 +443,7 @@ entry:
; SI: buffer_store_short
; SI: buffer_store_short
; SI-NOT: buffer_store_dword
-define void @store_v4i8_halfaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8_halfaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) {
entry:
%0 = trunc <4 x i32> %in to <4 x i8>
store <4 x i8> %0, <4 x i8> addrspace(0)* %out, align 2
@@ -460,7 +460,7 @@ entry:
; SI: buffer_store_dword
-define void @store_f32(float addrspace(0)* %out, float %in) {
+define amdgpu_kernel void @store_f32(float addrspace(0)* %out, float %in) {
store float %in, float addrspace(0)* %out
ret void
}
@@ -480,7 +480,7 @@ define void @store_f32(float addrspace(0)* %out, float %in) {
; XSI: buffer_store_dwordx2
; SI: buffer_store_dword
; SI: buffer_store_dword
-define void @store_v4i16(<4 x i16> addrspace(0)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i16(<4 x i16> addrspace(0)* %out, <4 x i32> %in) {
entry:
%0 = trunc <4 x i32> %in to <4 x i16>
store <4 x i16> %0, <4 x i16> addrspace(0)* %out
@@ -504,7 +504,7 @@ entry:
; SI: buffer_store_dword
; SI: buffer_store_dword
-define void @store_v2f32(<2 x float> addrspace(0)* %out, float %a, float %b) {
+define amdgpu_kernel void @store_v2f32(<2 x float> addrspace(0)* %out, float %a, float %b) {
entry:
%0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
%1 = insertelement <2 x float> %0, float %b, i32 1
@@ -533,7 +533,7 @@ entry:
; SI: buffer_store_dword
; SI: buffer_store_dword
-define void @store_v3i32(<3 x i32> addrspace(0)* %out, <3 x i32> %a) nounwind {
+define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(0)* %out, <3 x i32> %a) nounwind {
store <3 x i32> %a, <3 x i32> addrspace(0)* %out, align 16
ret void
}
@@ -563,7 +563,7 @@ define void @store_v3i32(<3 x i32> addrspace(0)* %out, <3 x i32> %a) nounwind {
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
-define void @store_v4i32(<4 x i32> addrspace(0)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i32(<4 x i32> addrspace(0)* %out, <4 x i32> %in) {
entry:
store <4 x i32> %in, <4 x i32> addrspace(0)* %out
ret void
@@ -594,7 +594,7 @@ entry:
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
-define void @store_v4i32_unaligned(<4 x i32> addrspace(0)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i32_unaligned(<4 x i32> addrspace(0)* %out, <4 x i32> %in) {
entry:
store <4 x i32> %in, <4 x i32> addrspace(0)* %out, align 4
ret void
@@ -626,7 +626,7 @@ entry:
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
-define void @store_v4f32(<4 x float> addrspace(0)* %out, <4 x float> addrspace(0)* %in) {
+define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(0)* %out, <4 x float> addrspace(0)* %in) {
%1 = load <4 x float>, <4 x float> addrspace(0) * %in
store <4 x float> %1, <4 x float> addrspace(0)* %out
ret void
@@ -644,7 +644,7 @@ define void @store_v4f32(<4 x float> addrspace(0)* %out, <4 x float> addrspace(0
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; SI: buffer_store_byte
-define void @store_i64_i8(i8 addrspace(0)* %out, i64 %in) {
+define amdgpu_kernel void @store_i64_i8(i8 addrspace(0)* %out, i64 %in) {
entry:
%0 = trunc i64 %in to i8
store i8 %0, i8 addrspace(0)* %out
@@ -663,7 +663,7 @@ entry:
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; SI: buffer_store_short
-define void @store_i64_i16(i16 addrspace(0)* %out, i64 %in) {
+define amdgpu_kernel void @store_i64_i16(i16 addrspace(0)* %out, i64 %in) {
entry:
%0 = trunc i64 %in to i16
store i16 %0, i16 addrspace(0)* %out
@@ -689,7 +689,7 @@ entry:
; XSI: buffer_store_dwordx2
; SI: buffer_store_dword
; SI: buffer_store_dword
-define void @vecload2(i32 addrspace(0)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
+define amdgpu_kernel void @vecload2(i32 addrspace(0)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
entry:
%0 = load i32, i32 addrspace(2)* %mem, align 4
%arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1
@@ -727,7 +727,7 @@ entry:
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
-define void @i128-const-store(i32 addrspace(0)* %out) {
+define amdgpu_kernel void @i128-const-store(i32 addrspace(0)* %out) {
entry:
store i32 1, i32 addrspace(0)* %out, align 4
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 1
diff --git a/test/CodeGen/AMDGPU/store-v3i64.ll b/test/CodeGen/AMDGPU/store-v3i64.ll
index 78db2d37724b..7518e887135c 100644
--- a/test/CodeGen/AMDGPU/store-v3i64.ll
+++ b/test/CodeGen/AMDGPU/store-v3i64.ll
@@ -5,7 +5,7 @@
; GCN-LABEL: {{^}}global_store_v3i64:
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 32
ret void
}
@@ -40,7 +40,7 @@ define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
-define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1
ret void
}
@@ -48,7 +48,7 @@ define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64
; GCN-LABEL: {{^}}local_store_v3i64:
; GCN: ds_write2_b64
; GCN: ds_write_b64
-define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32
ret void
}
@@ -83,7 +83,7 @@ define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8
-define void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 1
ret void
}
@@ -91,7 +91,7 @@ define void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64>
; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i32:
; GCN-DAG: buffer_store_dwordx2
; GCN-DAG: buffer_store_dword v
-define void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) {
%trunc = trunc <3 x i64> %x to <3 x i32>
store <3 x i32> %trunc, <3 x i32> addrspace(1)* %out
ret void
@@ -100,7 +100,7 @@ define void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x
; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i16:
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_dword v
-define void @global_truncstore_v3i64_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_truncstore_v3i64_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i64> %x) {
%trunc = trunc <3 x i64> %x to <3 x i16>
store <3 x i16> %trunc, <3 x i16> addrspace(1)* %out
ret void
@@ -110,7 +110,7 @@ define void @global_truncstore_v3i64_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x
; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i8:
; GCN-DAG: buffer_store_short
; GCN-DAG: buffer_store_byte v
-define void @global_truncstore_v3i64_to_v3i8(<3 x i8> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_truncstore_v3i64_to_v3i8(<3 x i8> addrspace(1)* %out, <3 x i64> %x) {
%trunc = trunc <3 x i64> %x to <3 x i8>
store <3 x i8> %trunc, <3 x i8> addrspace(1)* %out
ret void
@@ -120,7 +120,7 @@ define void @global_truncstore_v3i64_to_v3i8(<3 x i8> addrspace(1)* %out, <3 x i
; GCN-DAG: buffer_store_byte v
; GCN-DAG: buffer_store_byte v
; GCN-DAG: buffer_store_byte v
-define void @global_truncstore_v3i64_to_v3i1(<3 x i1> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_truncstore_v3i64_to_v3i1(<3 x i1> addrspace(1)* %out, <3 x i64> %x) {
%trunc = trunc <3 x i64> %x to <3 x i1>
store <3 x i1> %trunc, <3 x i1> addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/store-vector-ptrs.ll b/test/CodeGen/AMDGPU/store-vector-ptrs.ll
index d5af3b29118a..507f07dee052 100644
--- a/test/CodeGen/AMDGPU/store-vector-ptrs.ll
+++ b/test/CodeGen/AMDGPU/store-vector-ptrs.ll
@@ -5,7 +5,7 @@
; AMDGPUDAGToDAGISel::SelectMUBUFScratch() which is used for selecting
; scratch loads and stores.
; CHECK-LABEL: {{^}}store_vector_ptrs:
-define void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind {
+define amdgpu_kernel void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind {
%p = getelementptr [1024 x i32], <4 x [1024 x i32]*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
store <4 x i32*> %p, <4 x i32*>* %out
ret void
diff --git a/test/CodeGen/AMDGPU/store_typed.ll b/test/CodeGen/AMDGPU/store_typed.ll
index 515fcf04f406..eaa21617f937 100644
--- a/test/CodeGen/AMDGPU/store_typed.ll
+++ b/test/CodeGen/AMDGPU/store_typed.ll
@@ -6,7 +6,7 @@
; EG: MEM_RAT STORE_TYPED RAT(0) {{T[0-9]+, T[0-9]+}}, 1
; CM: MEM_RAT STORE_TYPED RAT(0) {{T[0-9]+, T[0-9]+}}
-define void @store_typed_rat0(<4 x i32> %data, <4 x i32> %index) {
+define amdgpu_kernel void @store_typed_rat0(<4 x i32> %data, <4 x i32> %index) {
call void @llvm.r600.rat.store.typed(<4 x i32> %data, <4 x i32> %index, i32 0)
ret void
}
@@ -16,7 +16,7 @@ define void @store_typed_rat0(<4 x i32> %data, <4 x i32> %index) {
; EG: MEM_RAT STORE_TYPED RAT(11) {{T[0-9]+, T[0-9]+}}, 1
; CM: MEM_RAT STORE_TYPED RAT(11) {{T[0-9]+, T[0-9]+}}
-define void @store_typed_rat11(<4 x i32> %data, <4 x i32> %index) {
+define amdgpu_kernel void @store_typed_rat11(<4 x i32> %data, <4 x i32> %index) {
call void @llvm.r600.rat.store.typed(<4 x i32> %data, <4 x i32> %index, i32 11)
ret void
}
diff --git a/test/CodeGen/AMDGPU/structurize.ll b/test/CodeGen/AMDGPU/structurize.ll
index 174e64e2cf8b..3cceb2d45c93 100644
--- a/test/CodeGen/AMDGPU/structurize.ll
+++ b/test/CodeGen/AMDGPU/structurize.ll
@@ -45,7 +45,7 @@
; CHECK: CF_END
-define void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
entry:
%0 = icmp ne i32 %a, 0
br i1 %0, label %diamond_head, label %branch_from
diff --git a/test/CodeGen/AMDGPU/structurize1.ll b/test/CodeGen/AMDGPU/structurize1.ll
index db0f50247e38..2e7d0e615e07 100644
--- a/test/CodeGen/AMDGPU/structurize1.ll
+++ b/test/CodeGen/AMDGPU/structurize1.ll
@@ -19,7 +19,7 @@
; CHECK-LABEL: {{^}}if_inside_loop:
; CHECK: LOOP_START_DX10
; CHECK: END_LOOP
-define void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_kernel void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
entry:
br label %for.body
diff --git a/test/CodeGen/AMDGPU/sub.i16.ll b/test/CodeGen/AMDGPU/sub.i16.ll
index b5d5f56b2796..6642411f7a63 100644
--- a/test/CodeGen/AMDGPU/sub.i16.ll
+++ b/test/CodeGen/AMDGPU/sub.i16.ll
@@ -7,7 +7,7 @@
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -24,7 +24,7 @@ define void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffff85, [[A]]
; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_sub_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -39,7 +39,7 @@ define void @v_test_sub_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x34d, [[A]]
; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -52,9 +52,9 @@ define void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_i16_inline_63:
; VI: flat_load_ushort [[A:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffffc1, [[A]]
+; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], 63, [[A]]
; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -70,7 +70,7 @@ define void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)*
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; VI-NEXT: buffer_store_dword [[ADD]]
-define void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -85,12 +85,12 @@ define void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64:
+; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
-; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -110,7 +110,7 @@ define void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: buffer_store_dword [[SEXT]]
-define void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -131,7 +131,7 @@ define void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -149,7 +149,7 @@ define void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
; GCN-LABEL: {{^}}v_test_sub_i16_constant_commute:
; VI: v_subrev_u16_e32 v{{[0-9]+}}, 0x800, v{{[0-9]+}}
; CI: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 0x800, v{{[0-9]+}}
-define void @v_test_sub_i16_constant_commute(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_i16_constant_commute(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
%size = call i32 @llvm.amdgcn.groupstaticsize()
%size.trunc = trunc i32 %size to i16
call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll
index 5816345098af..f366029fdea2 100644
--- a/test/CodeGen/AMDGPU/sub.ll
+++ b/test/CodeGen/AMDGPU/sub.ll
@@ -8,7 +8,7 @@ declare i32 @llvm.r600.read.tidig.x() readnone
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load i32, i32 addrspace(1)* %in
%b = load i32, i32 addrspace(1)* %b_ptr
@@ -25,7 +25,7 @@ define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
%a = load <2 x i32>, <2 x i32> addrspace(1) * %in
%b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
@@ -45,7 +45,7 @@ define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)
; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32>, <4 x i32> addrspace(1) * %in
%b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
@@ -55,7 +55,7 @@ define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)
}
; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
%b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
%a = load i16, i16 addrspace(1)* %in
%b = load i16, i16 addrspace(1)* %b_ptr
@@ -69,7 +69,7 @@ define void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
%a = load <2 x i16>, <2 x i16> addrspace(1) * %in
%b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr
@@ -85,7 +85,7 @@ define void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)
; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
%a = load <4 x i16>, <4 x i16> addrspace(1) * %in
%b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr
@@ -103,7 +103,7 @@ define void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)
; EG-DAG: SUBB_UINT
; EG-DAG: SUB_INT
; EG-DAG: SUB_INT {{[* ]*}}
-define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind {
%result = sub i64 %a, %b
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
@@ -118,7 +118,7 @@ define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind
; EG-DAG: SUBB_UINT
; EG-DAG: SUB_INT
; EG-DAG: SUB_INT {{[* ]*}}
-define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind {
+define amdgpu_kernel void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x() readnone
%a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
%b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
@@ -134,7 +134,7 @@ define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias
; SI: v_subb_u32_e32
; SI: v_sub_i32_e32
; SI: v_subb_u32_e32
-define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
%tid = call i32 @llvm.r600.read.tidig.x() readnone
%a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
%b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
@@ -154,7 +154,7 @@ define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(
; SI: v_subb_u32_e32
; SI: v_subrev_i32_e32
; SI: v_subb_u32_e32
-define void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
%tid = call i32 @llvm.r600.read.tidig.x() readnone
%a_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inA, i32 %tid
%b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inB, i32 %tid
diff --git a/test/CodeGen/AMDGPU/sub.v2i16.ll b/test/CodeGen/AMDGPU/sub.v2i16.ll
new file mode 100644
index 000000000000..69f0accef628
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -0,0 +1,278 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_v2i16:
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; VI: v_subrev_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+ %add = sub <2 x i16> %a, %b
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_sub_v2i16:
+; GFX9: s_load_dword [[VAL0:s[0-9]+]]
+; GFX9: s_load_dword [[VAL1:s[0-9]+]]
+; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]]
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[VVAL1]], [[VAL0]]
+
+; VI: s_sub_i32
+; VI: s_sub_i32
+define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
+ %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
+ %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1
+ %add = sub <2 x i16> %a, %b
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_sub_self_v2i16:
+; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]]
+; GCN: buffer_store_dword [[ZERO]]
+define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
+ %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
+ %add = sub <2 x i16> %a, %a
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: VI should not scalarize arg access.
+; GCN-LABEL: {{^}}s_test_sub_v2i16_kernarg:
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
+
+; VI: v_subrev_i32_e32
+; VI: v_subrev_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
+ %add = sub <2 x i16> %a, %b
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_sub_v2i16_constant:
+; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
+
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffe38, v{{[0-9]+}}
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %add = sub <2 x i16> %a, <i16 123, i16 456>
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_v2i16_neg_constant:
+; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0xfc21fcb3{{$}}
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
+
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x3df, v{{[0-9]+}}
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x34d, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %add = sub <2 x i16> %a, <i16 -845, i16 -991>
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_neg1:
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, -1{{$}}
+
+; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
+; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD0]]
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD1]]
+; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_e32
+define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %add = sub <2 x i16> %a, <i16 -1, i16 -1>
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_lo_zero_hi:
+; GFX9: s_mov_b32 [[K:s[0-9]+]], 32{{$}}
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
+
+; VI-NOT: v_subrev_i16
+; VI: v_add_u16_e32 v{{[0-9]+}}, 0xffffffe0, v{{[0-9]+}}
+; VI-NOT: v_subrev_i16
+; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_e32
+define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %add = sub <2 x i16> %a, <i16 32, i16 0>
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; The high 16 bits of the constant form the fp inline immediate 1.0.
+; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_fp_split:
+; GFX9: s_mov_b32 [[K:s[0-9]+]], 1.0
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
+
+; VI-NOT: v_subrev_i16
+; VI: v_add_u16_e32 v{{[0-9]+}}, 0xffffc080, v{{[0-9]+}}
+; VI-NOT: v_subrev_i16
+; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_e32
+define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %add = sub <2 x i16> %a, <i16 0, i16 16256>
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_v2i16_zext_to_v2i32:
+; GFX9: flat_load_dword [[A:v[0-9]+]]
+; GFX9: flat_load_dword [[B:v[0-9]+]]
+
+; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
+; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
+; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
+
+; VI: flat_load_ushort v[[A_HI:[0-9]+]]
+; VI: flat_load_ushort v[[A_LO:[0-9]+]]
+; VI: flat_load_ushort v[[B_HI:[0-9]+]]
+; VI: flat_load_ushort v[[B_LO:[0-9]+]]
+
+; VI: v_subrev_u16_e32 v[[ADD_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
+; VI-NOT: and
+; VI-NOT: shl
+; VI: v_subrev_u16_e32 v[[ADD_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
+; VI-NOT: and
+; VI-NOT: shl
+; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
+define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+ %add = sub <2 x i16> %a, %b
+ %ext = zext <2 x i16> %add to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_v2i16_zext_to_v2i64:
+; GFX9: flat_load_dword [[A:v[0-9]+]]
+; GFX9: flat_load_dword [[B:v[0-9]+]]
+
+; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
+; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
+; GFX9: buffer_store_dwordx4
+
+; VI: flat_load_ushort v[[A_LO:[0-9]+]]
+; VI: flat_load_ushort v[[A_HI:[0-9]+]]
+; VI: flat_load_ushort v[[B_LO:[0-9]+]]
+; VI: flat_load_ushort v[[B_HI:[0-9]+]]
+
+; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; VI-DAG: v_subrev_u16_e32
+; VI-DAG: v_subrev_u16_e32
+
+; VI: buffer_store_dwordx4
+define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+ %add = sub <2 x i16> %a, %b
+ %ext = zext <2 x i16> %add to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_v2i16_sext_to_v2i32:
+; GFX9: flat_load_dword [[A:v[0-9]+]]
+; GFX9: flat_load_dword [[B:v[0-9]+]]
+
+; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GFX9-DAG: v_bfe_i32 v[[ELT0:[0-9]+]], [[ADD]], 0, 16
+; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
+; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
+
+; VI: v_subrev_u16_e32
+; VI: v_subrev_u16_e32
+; VI: buffer_store_dwordx2
+define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+ %add = sub <2 x i16> %a, %b
+ %ext = sext <2 x i16> %add to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_v2i16_sext_to_v2i64:
+; GCN: flat_load_dword
+; GCN: flat_load_dword
+
+; GFX9: v_pk_sub_i16
+; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+
+; VI: v_subrev_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_subrev_u16_e32
+
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+ %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+ %add = sub <2 x i16> %a, %b
+ %ext = sext <2 x i16> %add to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll b/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll
index ec2ed78b4954..c2d04abf829f 100644
--- a/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll
+++ b/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll
@@ -1,39 +1,37 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -o - %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
-; SI-LABEL:{{^}}row_filter_C1_D0:
-; SI: s_endpgm
-; Function Attrs: nounwind
-define void @row_filter_C1_D0() {
+; GCN-LABEL:{{^}}row_filter_C1_D0:
+define amdgpu_kernel void @row_filter_C1_D0() #0 {
entry:
br i1 undef, label %for.inc.1, label %do.body.preheader
do.body.preheader: ; preds = %entry
- %0 = insertelement <4 x i32> zeroinitializer, i32 undef, i32 1
+ %tmp = insertelement <4 x i32> zeroinitializer, i32 undef, i32 1
br i1 undef, label %do.body56.1, label %do.body90
do.body90: ; preds = %do.body56.2, %do.body56.1, %do.body.preheader
- %1 = phi <4 x i32> [ %6, %do.body56.2 ], [ %5, %do.body56.1 ], [ %0, %do.body.preheader ]
- %2 = insertelement <4 x i32> %1, i32 undef, i32 2
- %3 = insertelement <4 x i32> %2, i32 undef, i32 3
+ %tmp1 = phi <4 x i32> [ %tmp6, %do.body56.2 ], [ %tmp5, %do.body56.1 ], [ %tmp, %do.body.preheader ]
+ %tmp2 = insertelement <4 x i32> %tmp1, i32 undef, i32 2
+ %tmp3 = insertelement <4 x i32> %tmp2, i32 undef, i32 3
br i1 undef, label %do.body124.1, label %do.body.1562.preheader
do.body.1562.preheader: ; preds = %do.body124.1, %do.body90
- %storemerge = phi <4 x i32> [ %3, %do.body90 ], [ %7, %do.body124.1 ]
- %4 = insertelement <4 x i32> undef, i32 undef, i32 1
+ %storemerge = phi <4 x i32> [ %tmp3, %do.body90 ], [ %tmp7, %do.body124.1 ]
+ %tmp4 = insertelement <4 x i32> undef, i32 undef, i32 1
br label %for.inc.1
do.body56.1: ; preds = %do.body.preheader
- %5 = insertelement <4 x i32> %0, i32 undef, i32 1
+ %tmp5 = insertelement <4 x i32> %tmp, i32 undef, i32 1
%or.cond472.1 = or i1 undef, undef
br i1 %or.cond472.1, label %do.body56.2, label %do.body90
do.body56.2: ; preds = %do.body56.1
- %6 = insertelement <4 x i32> %5, i32 undef, i32 1
+ %tmp6 = insertelement <4 x i32> %tmp5, i32 undef, i32 1
br label %do.body90
do.body124.1: ; preds = %do.body90
- %7 = insertelement <4 x i32> %3, i32 undef, i32 3
+ %tmp7 = insertelement <4 x i32> %tmp3, i32 undef, i32 3
br label %do.body.1562.preheader
for.inc.1: ; preds = %do.body.1562.preheader, %entry
@@ -42,8 +40,8 @@ for.inc.1: ; preds = %do.body.1562.prehea
unreachable
}
-; SI-LABEL: {{^}}foo:
-; SI: s_endpgm
+; GCN-LABEL: {{^}}foo:
+; GCN: s_endpgm
define amdgpu_ps void @foo() #0 {
bb:
br i1 undef, label %bb2, label %bb1
@@ -67,7 +65,7 @@ bb7: ; preds = %bb6
br label %bb4
bb9: ; preds = %bb2
- %tmp10 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp10 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp11 = extractelement <4 x float> %tmp10, i32 1
%tmp12 = extractelement <4 x float> %tmp10, i32 3
br label %bb14
@@ -78,9 +76,9 @@ bb13: ; preds = %bb2
bb14: ; preds = %bb27, %bb24, %bb9
%tmp15 = phi float [ %tmp12, %bb9 ], [ undef, %bb27 ], [ 0.000000e+00, %bb24 ]
%tmp16 = phi float [ %tmp11, %bb9 ], [ undef, %bb27 ], [ %tmp25, %bb24 ]
- %tmp17 = fmul float 10.5, %tmp16
- %tmp18 = fmul float 11.5, %tmp15
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp18, float %tmp17, float %tmp17, float %tmp17)
+ %tmp17 = fmul float 1.050000e+01, %tmp16
+ %tmp18 = fmul float 1.150000e+01, %tmp15
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp18, float %tmp17, float %tmp17, float %tmp17, i1 true, i1 true) #0
ret void
bb23: ; preds = %bb13
@@ -97,13 +95,9 @@ bb27: ; preds = %bb24
br label %bb14
}
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1
attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
index 72a1f1e25b30..35615c40d498 100644
--- a/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
+++ b/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
@@ -20,7 +20,7 @@ target triple="amdgcn--"
; CHECK-NEXT: s_mov_b32 s6, -1
; CHECK-NEXT: buffer_store_dword v1, off, s[4:7], 0
; CHECK-NEXT: s_endpgm
-define void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind {
entry:
%v0 = insertelement <4 x float> undef, float %a0, i32 0
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
diff --git a/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll b/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll
index 8bd995a8ecbb..57c267e54a14 100644
--- a/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll
+++ b/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll
@@ -5,7 +5,7 @@
; Just make sure this test doesn't crash.
; CHECK-LABEL: foobar:
; CHECK: s_endpgm
-define void @foobar() {
+define amdgpu_kernel void @foobar() {
%v0 = icmp eq <4 x i32> undef, <i32 0, i32 1, i32 2, i32 3>
%v3 = sext <4 x i1> %v0 to <4 x i32>
%v4 = extractelement <4 x i32> %v3, i32 1
diff --git a/test/CodeGen/AMDGPU/subreg-intervals.mir b/test/CodeGen/AMDGPU/subreg-intervals.mir
index c4e00215159b..c477fe9bc6d3 100644
--- a/test/CodeGen/AMDGPU/subreg-intervals.mir
+++ b/test/CodeGen/AMDGPU/subreg-intervals.mir
@@ -10,8 +10,8 @@
# CHECK-LABEL: Machine code for function test1:
--- |
- define void @test0() { ret void }
- define void @test1() { ret void }
+ define amdgpu_kernel void @test0() { ret void }
+ define amdgpu_kernel void @test1() { ret void }
...
---
name: test0
diff --git a/test/CodeGen/AMDGPU/subreg_interference.mir b/test/CodeGen/AMDGPU/subreg_interference.mir
new file mode 100644
index 000000000000..24d06a576c2a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/subreg_interference.mir
@@ -0,0 +1,24 @@
+# RUN: llc -o - %s -mtriple=amdgcn--amdhsa -verify-machineinstrs -run-pass=greedy,virtregrewriter | FileCheck %s
+---
+# We should not detect any interference between the subranges of %0/%1 here
+# and should only allocate sgpr0-sgpr3.
+#
+# CHECK-LABEL: func0
+# CHECK: S_NOP 0, implicit-def %sgpr0
+# CHECK: S_NOP 0, implicit-def %sgpr3
+# CHECK: S_NOP 0, implicit-def %sgpr1
+# CHECK: S_NOP 0, implicit-def %sgpr2
+# CHECK: S_NOP 0, implicit %sgpr0, implicit %sgpr3
+# CHECK: S_NOP 0, implicit %sgpr1, implicit %sgpr2
+name: func0
+body: |
+ bb.0:
+ S_NOP 0, implicit-def undef %0.sub0 : sreg_128
+ S_NOP 0, implicit-def %0.sub3
+ S_NOP 0, implicit-def undef %1.sub1 : sreg_128
+ S_NOP 0, implicit-def %1.sub2
+
+
+ S_NOP 0, implicit %0.sub0, implicit %0.sub3
+ S_NOP 0, implicit %1.sub1, implicit %1.sub2
+...
diff --git a/test/CodeGen/AMDGPU/target-cpu.ll b/test/CodeGen/AMDGPU/target-cpu.ll
index cf80ff3f4c83..466e89ebee80 100644
--- a/test/CodeGen/AMDGPU/target-cpu.ll
+++ b/test/CodeGen/AMDGPU/target-cpu.ll
@@ -14,7 +14,7 @@ declare void @llvm.amdgcn.s.dcache.wb() #0
; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400
; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]]
; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
-define void @target_none() #0 {
+define amdgpu_kernel void @target_none() #0 {
%kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
%kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
%kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
@@ -30,7 +30,7 @@ define void @target_none() #0 {
; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400
; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]]
; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
-define void @target_tahiti() #1 {
+define amdgpu_kernel void @target_tahiti() #1 {
%kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
%kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
%kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
@@ -46,7 +46,7 @@ define void @target_tahiti() #1 {
; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100
; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
; CHECK: s_dcache_inv_vol
-define void @target_bonaire() #3 {
+define amdgpu_kernel void @target_bonaire() #3 {
%kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
%kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
%kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
@@ -63,7 +63,7 @@ define void @target_bonaire() #3 {
; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x400
; CHECK: flat_store_dword
; CHECK: s_dcache_wb{{$}}
-define void @target_fiji() #4 {
+define amdgpu_kernel void @target_fiji() #4 {
%kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
%kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
%kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
@@ -79,7 +79,7 @@ define void @target_fiji() #4 {
; CHECK-LABEL: {{^}}promote_alloca_enabled:
; CHECK: ds_read_b32
; CHECK: ; LDSByteSize: 5120
-define void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {
+define amdgpu_kernel void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {
entry:
%stack = alloca [5 x i32], align 4
%tmp = load i32, i32 addrspace(1)* %in, align 4
@@ -93,7 +93,7 @@ entry:
; CHECK: SCRATCH_RSRC_DWORD0
; CHECK: SCRATCH_RSRC_DWORD1
; CHECK: ScratchSize: 24
-define void @promote_alloca_disabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #6 {
+define amdgpu_kernel void @promote_alloca_disabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #6 {
entry:
%stack = alloca [5 x i32], align 4
%tmp = load i32, i32 addrspace(1)* %in, align 4
diff --git a/test/CodeGen/AMDGPU/trap.ll b/test/CodeGen/AMDGPU/trap.ll
index 1555cfe39b1e..77ad895d0e86 100644
--- a/test/CodeGen/AMDGPU/trap.ll
+++ b/test/CodeGen/AMDGPU/trap.ll
@@ -1,13 +1,81 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-TRAP %s
-; GCN: warning: <unknown>:0:0: in function trap void (): trap handler not supported
+; RUN: llc -mtriple=amdgcn--amdhsa -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-HSA-TRAP %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
+
+; enable the trap-handler feature
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=TRAP-BIT -check-prefix=MESA-TRAP %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=TRAP-BIT %s
+
+; disable the trap-handler feature
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=NO-TRAP-BIT -check-prefix=NOMESA-TRAP %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=NO-TRAP-BIT %s
+
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
declare void @llvm.trap() #0
+declare void @llvm.debugtrap() #0
+
+; MESA-TRAP: .section .AMDGPU.config
+; MESA-TRAP: .long 47180
+; MESA-TRAP-NEXT: .long 208
+
+; NOMESA-TRAP: .section .AMDGPU.config
+; NOMESA-TRAP: .long 47180
+; NOMESA-TRAP-NEXT: .long 144
+
+; GCN-LABEL: {{^}}hsa_trap:
+; HSA-TRAP: enable_trap_handler = 1
+; HSA-TRAP: s_mov_b64 s[0:1], s[4:5]
+; HSA-TRAP: s_trap 2
+
+; For llvm.trap in the HSA path without the ABI, directly generate an s_endpgm instruction without any warning.
+; NO-HSA-TRAP: enable_trap_handler = 0
+; NO-HSA-TRAP: s_endpgm
+; NO-HSA-TRAP: COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+
+; TRAP-BIT: enable_trap_handler = 1
+; NO-TRAP-BIT: enable_trap_handler = 0
+; NO-MESA-TRAP: s_endpgm
+define amdgpu_kernel void @hsa_trap() {
+ call void @llvm.trap()
+ ret void
+}
+
+; MESA-TRAP: .section .AMDGPU.config
+; MESA-TRAP: .long 47180
+; MESA-TRAP-NEXT: .long 208
+
+; NOMESA-TRAP: .section .AMDGPU.config
+; NOMESA-TRAP: .long 47180
+; NOMESA-TRAP-NEXT: .long 144
+
+; GCN-WARNING: warning: <unknown>:0:0: in function hsa_debugtrap void (): debugtrap handler not supported
+; GCN-LABEL: {{^}}hsa_debugtrap:
+; HSA-TRAP: enable_trap_handler = 1
+; HSA-TRAP: s_mov_b64 s[0:1], s[4:5]
+; HSA-TRAP: s_trap 3
+
+; For llvm.debugtrap in the non-HSA path without the ABI, generate a warning and an s_endpgm instruction.
+; NO-HSA-TRAP: enable_trap_handler = 0
+; NO-HSA-TRAP: s_endpgm
+
+; TRAP-BIT: enable_trap_handler = 1
+; NO-TRAP-BIT: enable_trap_handler = 0
+; NO-MESA-TRAP: s_endpgm
+define amdgpu_kernel void @hsa_debugtrap() {
+ call void @llvm.debugtrap()
+ ret void
+}
+; For the non-HSA path
; GCN-LABEL: {{^}}trap:
-; GCN: s_endpgm
-; GCN-NEXT: s_endpgm
-define void @trap() {
+; TRAP-BIT: enable_trap_handler = 1
+; NO-TRAP-BIT: enable_trap_handler = 0
+; NO-HSA-TRAP: s_endpgm
+; NO-MESA-TRAP: s_endpgm
+define amdgpu_kernel void @trap() {
call void @llvm.trap()
ret void
}
diff --git a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
index a331475820a0..f90040385f75 100644
--- a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
+++ b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
@@ -4,7 +4,7 @@
; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32:
; CHECK: buffer_load_dword v
; CHECK: buffer_store_dword v
-define void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
%bc = bitcast <2 x i32> %ld to i64
%trunc = trunc i64 %bc to i32
@@ -15,7 +15,7 @@ define void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace
; CHECK-LABEL: {{^}}trunc_i96_bitcast_v3i32:
; CHECK: buffer_load_dword v
; CHECK: buffer_store_dword v
-define void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) {
%ld = load <3 x i32>, <3 x i32> addrspace(1)* %in
%bc = bitcast <3 x i32> %ld to i96
%trunc = trunc i96 %bc to i32
@@ -26,7 +26,7 @@ define void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace
; CHECK-LABEL: {{^}}trunc_i128_bitcast_v4i32:
; CHECK: buffer_load_dword v
; CHECK: buffer_store_dword v
-define void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
%bc = bitcast <4 x i32> %ld to i128
%trunc = trunc i128 %bc to i32
@@ -38,7 +38,7 @@ define void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspac
; CHECK-LABEL: {{^}}trunc_i16_bitcast_v2i16:
; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
; CHECK: buffer_store_short [[VAL]]
-define void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
%ld = load <2 x i16>, <2 x i16> addrspace(1)* %in
%bc = bitcast <2 x i16> %ld to i32
%trunc = trunc i32 %bc to i16
@@ -54,7 +54,7 @@ define void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace
; SI: buffer_load_dword v[[VAL:[0-9]+]]
; VI: buffer_load_dwordx2 v{{\[}}[[VAL:[0-9]+]]
; CHECK: buffer_store_short [[VAL]]
-define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
%ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
%bc = bitcast <4 x i16> %ld to i64
%trunc = trunc i64 %bc to i16
@@ -66,7 +66,7 @@ define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace
; CHECK-LABEL: {{^}}trunc_i8_bitcast_v2i8:
; CHECK: buffer_load_ubyte [[VAL:v[0-9]+]]
; CHECK: buffer_store_byte [[VAL]]
-define void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
%ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
%bc = bitcast <2 x i8> %ld to i16
%trunc = trunc i16 %bc to i8
@@ -77,7 +77,7 @@ define void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)*
; CHECK-LABEL: {{^}}trunc_i32_bitcast_v4i8:
; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
; CHECK: buffer_store_byte [[VAL]]
-define void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
%ld = load <4 x i8>, <4 x i8> addrspace(1)* %in
%bc = bitcast <4 x i8> %ld to i32
%trunc = trunc i32 %bc to i8
@@ -88,7 +88,7 @@ define void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)
; CHECK-LABEL: {{^}}trunc_i24_bitcast_v3i8:
; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
; CHECK: buffer_store_byte [[VAL]]
-define void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) {
%ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
%bc = bitcast <3 x i8> %ld to i24
%trunc = trunc i24 %bc to i8
diff --git a/test/CodeGen/AMDGPU/trunc-cmp-constant.ll b/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
index 7a4bced9d436..cb8d36550331 100644
--- a/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
+++ b/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; SI: v_cmp_eq_u32_e32 vcc, 0, [[TMP]]{{$}}
; SI: v_cndmask_b32_e64
; SI: buffer_store_byte
-define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = sext i1 %load to i32
%cmp = icmp eq i32 %ext, 0
@@ -25,7 +25,7 @@ define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspa
; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = zext i1 %load to i32
%cmp = icmp eq i32 %ext, 0
@@ -36,7 +36,7 @@ define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspa
; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_1:
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; SI: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = sext i1 %load to i32
%cmp = icmp eq i32 %ext, 1
@@ -48,7 +48,7 @@ define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspa
; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = zext i1 %load to i32
%cmp = icmp eq i32 %ext, 1
@@ -60,7 +60,7 @@ define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspa
; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
; SI: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = sext i1 %load to i32
%cmp = icmp eq i32 %ext, -1
@@ -71,7 +71,7 @@ define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addr
; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_neg1:
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = zext i1 %load to i32
%cmp = icmp eq i32 %ext, -1
@@ -84,7 +84,7 @@ define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addr
; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
; SI: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = sext i1 %load to i32
%cmp = icmp ne i32 %ext, 0
@@ -96,7 +96,7 @@ define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspa
; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = zext i1 %load to i32
%cmp = icmp ne i32 %ext, 0
@@ -107,7 +107,7 @@ define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspa
; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_1:
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
; SI: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = sext i1 %load to i32
%cmp = icmp ne i32 %ext, 1
@@ -122,7 +122,7 @@ define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspa
; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = zext i1 %load to i32
%cmp = icmp ne i32 %ext, 1
@@ -137,7 +137,7 @@ define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspa
; XSI: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], [[TMP]], 0{{$}}
; XSI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP0]]
; XSI-NEXT: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = sext i1 %load to i32
%cmp = icmp ne i32 %ext, -1
@@ -148,7 +148,7 @@ define void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addr
; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_neg1:
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = zext i1 %load to i32
%cmp = icmp ne i32 %ext, -1
@@ -162,7 +162,7 @@ define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addr
; SI: v_cmp_ne_u32_e32 vcc, -1, [[LOAD]]{{$}}
; SI-NEXT: v_cndmask_b32_e64
; SI: {{buffer|flat}}_store_byte
-define void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr i8, i8 addrspace(1)* %in, i32 %tid.x
%load = load i8, i8 addrspace(1)* %in.ptr
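The change applied throughout these files is mechanical: tests that relied on the default C calling convention are retagged with the AMDGPU kernel entry-point convention. A minimal before/after sketch, with a hypothetical function name:

; Before: default calling convention; not lowered as a kernel entry point.
define void @example(i32 addrspace(1)* %out) {
  store i32 0, i32 addrspace(1)* %out
  ret void
}

; After: an explicit kernel; arguments are lowered as kernarg loads.
define amdgpu_kernel void @example(i32 addrspace(1)* %out) {
  store i32 0, i32 addrspace(1)* %out
  ret void
}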
diff --git a/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll b/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
index 03b8af0610d7..d67b8f981b28 100644
--- a/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
+++ b/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
@@ -2,7 +2,7 @@
; GCN-LABEL: {{^}}global_truncstore_f64_to_f16:
; GCN: s_endpgm
-define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 {
%val = load double, double addrspace(1)* %in
%cvt = fptrunc double %val to half
store half %cvt, half addrspace(1)* %out
@@ -11,7 +11,7 @@ define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrsp
; GCN-LABEL: {{^}}global_truncstore_v2f64_to_v2f16:
; GCN: s_endpgm
-define void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
%val = load <2 x double>, <2 x double> addrspace(1)* %in
%cvt = fptrunc <2 x double> %val to <2 x half>
store <2 x half> %cvt, <2 x half> addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2
; GCN-LABEL: {{^}}global_truncstore_v3f64_to_v3f16:
; GCN: s_endpgm
-define void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
%val = load <3 x double>, <3 x double> addrspace(1)* %in
%cvt = fptrunc <3 x double> %val to <3 x half>
store <3 x half> %cvt, <3 x half> addrspace(1)* %out
@@ -29,7 +29,7 @@ define void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3
; GCN-LABEL: {{^}}global_truncstore_v4f64_to_v4f16:
; GCN: s_endpgm
-define void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
%val = load <4 x double>, <4 x double> addrspace(1)* %in
%cvt = fptrunc <4 x double> %val to <4 x half>
store <4 x half> %cvt, <4 x half> addrspace(1)* %out
@@ -38,7 +38,7 @@ define void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4
; GCN-LABEL: {{^}}global_truncstore_v8f64_to_v8f16:
; GCN: s_endpgm
-define void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 {
%val = load <8 x double>, <8 x double> addrspace(1)* %in
%cvt = fptrunc <8 x double> %val to <8 x half>
store <8 x half> %cvt, <8 x half> addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8
; GCN-LABEL: {{^}}global_truncstore_v16f64_to_v16f16:
; GCN: s_endpgm
-define void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 {
%val = load <16 x double>, <16 x double> addrspace(1)* %in
%cvt = fptrunc <16 x double> %val to <16 x half>
store <16 x half> %cvt, <16 x half> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/trunc-store-i1.ll b/test/CodeGen/AMDGPU/trunc-store-i1.ll
index da2a5b43dad5..4ea2352f57f3 100644
--- a/test/CodeGen/AMDGPU/trunc-store-i1.ll
+++ b/test/CodeGen/AMDGPU/trunc-store-i1.ll
@@ -7,7 +7,7 @@
; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
; SI: buffer_store_byte [[VREG]],
-define void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind {
+define amdgpu_kernel void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind {
%trunc = trunc i32 %val to i1
store i1 %trunc, i1 addrspace(1)* %out, align 1
ret void
@@ -15,7 +15,7 @@ define void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwi
; SI-LABEL: {{^}}global_truncstore_i64_to_i1:
; SI: buffer_store_byte
-define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind {
+define amdgpu_kernel void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind {
%trunc = trunc i64 %val to i1
store i1 %trunc, i1 addrspace(1)* %out, align 1
ret void
@@ -26,13 +26,13 @@ define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwi
; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
; SI: buffer_store_byte [[VREG]],
-define void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind {
+define amdgpu_kernel void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind {
%trunc = trunc i16 %val to i1
store i1 %trunc, i1 addrspace(1)* %out, align 1
ret void
}
; SI-LABEL: {{^}}global_truncstore_i16_to_i1:
-define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind {
+define amdgpu_kernel void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind {
%add = add i16 %val0, %val1
%trunc = trunc i16 %add to i1
store i1 %trunc, i1 addrspace(1)* %out, align 1
diff --git a/test/CodeGen/AMDGPU/trunc-store.ll b/test/CodeGen/AMDGPU/trunc-store.ll
index c6727e1e1273..f45de679588f 100644
--- a/test/CodeGen/AMDGPU/trunc-store.ll
+++ b/test/CodeGen/AMDGPU/trunc-store.ll
@@ -3,7 +3,7 @@
; FUNC-LABEL: {{^}}truncstore_arg_v16i32_to_v16i8:
; SI: buffer_store_dwordx4
-define void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i32> %in) {
+define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i32> %in) {
%trunc = trunc <16 x i32> %in to <16 x i8>
store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out
ret void
@@ -11,7 +11,7 @@ define void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x
; FUNC-LABEL: {{^}}truncstore_arg_v16i64_to_v16i8:
; SI: buffer_store_dwordx4
-define void @truncstore_arg_v16i64_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i64> %in) {
+define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i64> %in) {
%trunc = trunc <16 x i64> %in to <16 x i8>
store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll b/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll
index 878ea3f48995..3dbc10d2e9b5 100644
--- a/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll
+++ b/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll
@@ -6,7 +6,7 @@
; CHECK-LABEL: {{^}}test:
; CHECK: MEM_RAT_CACHELESS STORE_RAW
-define void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) {
+define amdgpu_kernel void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) {
entry:
%0 = icmp eq i32 %cond, 0
br i1 %0, label %if, label %done
diff --git a/test/CodeGen/AMDGPU/trunc.ll b/test/CodeGen/AMDGPU/trunc.ll
index 2c2ce4c5d351..0c91d52df0c0 100644
--- a/test/CodeGen/AMDGPU/trunc.ll
+++ b/test/CodeGen/AMDGPU/trunc.ll
@@ -4,7 +4,7 @@
declare i32 @llvm.r600.read.tidig.x() nounwind readnone
-define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
; GCN-LABEL: {{^}}trunc_i64_to_i32_store:
; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[0:1],
; GCN: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]]
@@ -28,7 +28,7 @@ define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
; SI: buffer_store_dword [[VSHL]]
; VI: flat_store_dword v[{{[0-9:]+}}], [[VSHL]]
-define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
%b = shl i64 %a, 2
%result = trunc i64 %b to i32
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -46,7 +46,7 @@ define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
; VI: flat_store_dword v[{{[0-9:]+}}], v[[LO_VREG]]
; GCN: v_mov_b32_e32
; GCN: v_mov_b32_e32
-define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
%aa = add i64 %a, 234 ; Prevent shrinking store.
%b = shl i64 %aa, 2
%result = trunc i64 %b to i32
@@ -56,9 +56,8 @@ define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64
}
; GCN-LABEL: {{^}}trunc_i32_to_i1:
-; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; GCN: v_cmp_eq_u32
-define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) {
+; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}}
+define amdgpu_kernel void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) {
%a = load i32, i32 addrspace(1)* %ptr, align 4
%trunc = trunc i32 %a to i1
%result = select i1 %trunc, i32 1, i32 0
@@ -67,9 +66,8 @@ define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) {
}
; GCN-LABEL: {{^}}trunc_i8_to_i1:
-; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; GCN: v_cmp_eq_u32
-define void @trunc_i8_to_i1(i8 addrspace(1)* %out, i8 addrspace(1)* %ptr) {
+; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}}
+define amdgpu_kernel void @trunc_i8_to_i1(i8 addrspace(1)* %out, i8 addrspace(1)* %ptr) {
%a = load i8, i8 addrspace(1)* %ptr, align 4
%trunc = trunc i8 %a to i1
%result = select i1 %trunc, i8 1, i8 0
@@ -78,9 +76,8 @@ define void @trunc_i8_to_i1(i8 addrspace(1)* %out, i8 addrspace(1)* %ptr) {
}
; GCN-LABEL: {{^}}sgpr_trunc_i16_to_i1:
-; GCN: s_and_b32 s{{[0-9]+}}, 1, s{{[0-9]+}}
-; GCN: v_cmp_eq_u32
-define void @sgpr_trunc_i16_to_i1(i16 addrspace(1)* %out, i16 %a) {
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+define amdgpu_kernel void @sgpr_trunc_i16_to_i1(i16 addrspace(1)* %out, i16 %a) {
%trunc = trunc i16 %a to i1
%result = select i1 %trunc, i16 1, i16 0
store i16 %result, i16 addrspace(1)* %out, align 4
@@ -88,9 +85,8 @@ define void @sgpr_trunc_i16_to_i1(i16 addrspace(1)* %out, i16 %a) {
}
; GCN-LABEL: {{^}}sgpr_trunc_i32_to_i1:
-; GCN: s_and_b32 s{{[0-9]+}}, 1, s{{[0-9]+}}
-; GCN: v_cmp_eq_u32
-define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+define amdgpu_kernel void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
%trunc = trunc i32 %a to i1
%result = select i1 %trunc, i32 1, i32 0
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -103,7 +99,7 @@ define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
; GCN: s_and_b32 [[MASKED:s[0-9]+]], 1, s[[SLO]]
; GCN: v_cmp_eq_u32_e64 s{{\[}}[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], [[MASKED]], 1{{$}}
; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, s{{\[}}[[VLO]]:[[VHI]]]
-define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) {
%trunc = trunc i64 %x to i1
%sel = select i1 %trunc, i32 63, i32 -12
store i32 %sel, i32 addrspace(1)* %out
@@ -116,7 +112,7 @@ define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) {
; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]]
; GCN: v_cmp_eq_u32_e32 vcc, 1, [[MASKED]]
; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc
-define void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
%gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
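The check updates in trunc.ll above drop the v_cmp_eq_u32 expectations because selecting 1 or 0 on the low bit of a value needs no compare: masking with 1 already produces the selected result. A sketch of that fold in equivalent IR, with a hypothetical name:

; select (trunc i32 %a to i1), i32 1, i32 0  ==>  and i32 %a, 1
define i32 @low_bit_select(i32 %a) {
  %masked = and i32 %a, 1   ; 1 when the low bit is set, else 0
  ret i32 %masked
}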
diff --git a/test/CodeGen/AMDGPU/tti-unroll-prefs.ll b/test/CodeGen/AMDGPU/tti-unroll-prefs.ll
index 76c32afc1f21..7c369a312761 100644
--- a/test/CodeGen/AMDGPU/tti-unroll-prefs.ll
+++ b/test/CodeGen/AMDGPU/tti-unroll-prefs.ll
@@ -19,7 +19,7 @@
; CHECK: store i8 0, i8 addrspace(1)*
; CHECK-NOT: store i8 0, i8 addrspace(1)*
; CHECK: ret void
-define void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) {
entry:
%add = add nsw i32 %b, 4
%cmp = icmp sgt i32 %add, %a
diff --git a/test/CodeGen/AMDGPU/uaddo.ll b/test/CodeGen/AMDGPU/uaddo.ll
index 35af7119a300..632ccaa7e612 100644
--- a/test/CodeGen/AMDGPU/uaddo.ll
+++ b/test/CodeGen/AMDGPU/uaddo.ll
@@ -1,19 +1,16 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
-declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
-declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
-
-; FUNC-LABEL: {{^}}uaddo_i64_zext:
-; SI: add
-; SI: addc
-; SI: addc
+; FUNC-LABEL: {{^}}s_uaddo_i64_zext:
+; GCN: s_add_u32
+; GCN: s_addc_u32
+; GCN: v_cmp_lt_u64_e32 vcc
; EG: ADDC_UINT
; EG: ADDC_UINT
-define void @uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
- %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind
+define amdgpu_kernel void @s_uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+ %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %uadd, 0
%carry = extractvalue { i64, i1 } %uadd, 1
%ext = zext i1 %carry to i64
@@ -22,13 +19,16 @@ define void @uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
ret void
}
+; FIXME: Could do scalar
+
; FUNC-LABEL: {{^}}s_uaddo_i32:
-; SI: s_add_i32
+; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
; EG: ADDC_UINT
; EG: ADD_INT
-define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
- %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind
+define amdgpu_kernel void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
+ %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %uadd, 0
%carry = extractvalue { i32, i1 } %uadd, 1
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -37,14 +37,19 @@ define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
}
; FUNC-LABEL: {{^}}v_uaddo_i32:
-; SI: v_add_i32
+; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
; EG: ADDC_UINT
; EG: ADD_INT
-define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %a = load i32, i32 addrspace(1)* %aptr, align 4
- %b = load i32, i32 addrspace(1)* %bptr, align 4
- %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind
+define amdgpu_kernel void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr
+ %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr
+ %a = load i32, i32 addrspace(1)* %a.gep, align 4
+ %b = load i32, i32 addrspace(1)* %b.gep, align 4
+ %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %uadd, 0
%carry = extractvalue { i32, i1 } %uadd, 1
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -52,14 +57,36 @@ define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
ret void
}
+; FUNC-LABEL: {{^}}v_uaddo_i32_novcc:
+; GCN: v_add_i32_e64 v{{[0-9]+}}, [[COND:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[COND]]
+
+; EG: ADDC_UINT
+; EG: ADD_INT
+define amdgpu_kernel void @v_uaddo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr
+ %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr
+ %a = load i32, i32 addrspace(1)* %a.gep, align 4
+ %b = load i32, i32 addrspace(1)* %b.gep, align 4
+ %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
+ %val = extractvalue { i32, i1 } %uadd, 0
+ %carry = extractvalue { i32, i1 } %uadd, 1
+ store volatile i32 %val, i32 addrspace(1)* %out, align 4
+ call void asm sideeffect "", "~{VCC}"() #0
+ store volatile i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
; FUNC-LABEL: {{^}}s_uaddo_i64:
-; SI: s_add_u32
-; SI: s_addc_u32
+; GCN: s_add_u32
+; GCN: s_addc_u32
; EG: ADDC_UINT
; EG: ADD_INT
-define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
- %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind
+define amdgpu_kernel void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 {
+ %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %uadd, 0
%carry = extractvalue { i64, i1 } %uadd, 1
store i64 %val, i64 addrspace(1)* %out, align 8
@@ -68,18 +95,48 @@ define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64
}
; FUNC-LABEL: {{^}}v_uaddo_i64:
-; SI: v_add_i32
-; SI: v_addc_u32
+; GCN: v_add_i32
+; GCN: v_addc_u32
; EG: ADDC_UINT
; EG: ADD_INT
-define void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
- %a = load i64, i64 addrspace(1)* %aptr, align 4
- %b = load i64, i64 addrspace(1)* %bptr, align 4
- %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind
+define amdgpu_kernel void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds i64, i64 addrspace(1)* %a.ptr
+ %b.gep = getelementptr inbounds i64, i64 addrspace(1)* %b.ptr
+ %a = load i64, i64 addrspace(1)* %a.gep
+ %b = load i64, i64 addrspace(1)* %b.gep
+ %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %uadd, 0
%carry = extractvalue { i64, i1 } %uadd, 1
- store i64 %val, i64 addrspace(1)* %out, align 8
+ store i64 %val, i64 addrspace(1)* %out
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_uaddo_i16:
+; VI: v_add_u16_e32
+; VI: v_cmp_lt_u16_e32
+define amdgpu_kernel void @v_uaddo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr
+ %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %b.ptr
+ %a = load i16, i16 addrspace(1)* %a.gep
+ %b = load i16, i16 addrspace(1)* %b.gep
+ %uadd = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 %a, i16 %b)
+ %val = extractvalue { i16, i1 } %uadd, 0
+ %carry = extractvalue { i16, i1 } %uadd, 1
+ store i16 %val, i16 addrspace(1)* %out
store i1 %carry, i1 addrspace(1)* %carryout
ret void
}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare { i16, i1 } @llvm.uadd.with.overflow.i16(i16, i16) #1
+declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
+declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
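For reference, the overflow intrinsics exercised above return the wrapped sum together with a carry bit; on GCN the carry materializes in vcc (or, as in the _novcc variant, an arbitrary SGPR pair). A minimal sketch of how a caller unpacks the pair, with hypothetical names:

declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32)

define i32 @sum_with_carry(i32 %a, i32 %b, i1 addrspace(1)* %c) {
  %pair = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %pair, 0    ; a + b modulo 2^32
  %carry = extractvalue { i32, i1 } %pair, 1  ; set iff the add wrapped
  store i1 %carry, i1 addrspace(1)* %c
  ret i32 %val
}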
diff --git a/test/CodeGen/AMDGPU/udiv.ll b/test/CodeGen/AMDGPU/udiv.ll
index da88d2a8e8cb..2874a0cdbc05 100644
--- a/test/CodeGen/AMDGPU/udiv.ll
+++ b/test/CodeGen/AMDGPU/udiv.ll
@@ -1,22 +1,27 @@
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=VI %s
+
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}udiv_i32:
; EG-NOT: SETGE_INT
; EG: CF_END
-define void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+
+; SI: v_rcp_iflag_f32_e32
+define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
- %a = load i32, i32 addrspace(1) * %in
- %b = load i32, i32 addrspace(1) * %b_ptr
+ %a = load i32, i32 addrspace(1)* %in
+ %b = load i32, i32 addrspace(1)* %b_ptr
%result = udiv i32 %a, %b
store i32 %result, i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}s_udiv_i32:
-
-define void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+; SI: v_rcp_iflag_f32_e32
+define amdgpu_kernel void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
%result = udiv i32 %a, %b
store i32 %result, i32 addrspace(1)* %out
ret void
@@ -30,8 +35,10 @@ define void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
; FUNC-LABEL: {{^}}udiv_v2i32:
; EG: CF_END
+; SI: v_rcp_iflag_f32_e32
+; SI: v_rcp_iflag_f32_e32
; SI: s_endpgm
-define void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
%a = load <2 x i32>, <2 x i32> addrspace(1) * %in
%b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
@@ -43,7 +50,7 @@ define void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
; FUNC-LABEL: {{^}}udiv_v4i32:
; EG: CF_END
; SI: s_endpgm
-define void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32>, <4 x i32> addrspace(1) * %in
%b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
@@ -56,7 +63,7 @@ define void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
; SI: buffer_load_dword [[VAL:v[0-9]+]]
; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 4, [[VAL]]
; SI: buffer_store_dword [[RESULT]]
-define void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load i32, i32 addrspace(1)* %in
%result = udiv i32 %a, 16
@@ -70,7 +77,7 @@ define void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]]
; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 25, [[MULHI]]
; SI: buffer_store_dword [[RESULT]]
-define void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load i32, i32 addrspace(1)* %in
%result = udiv i32 %a, 34259182
@@ -84,7 +91,7 @@ define void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]]
; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 24, [[MULHI]]
; SI: buffer_store_dword [[RESULT]]
-define void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load i32, i32 addrspace(1)* %in
%result = udiv i32 %a, 34259183
@@ -96,7 +103,7 @@ define void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; SI: v_rcp_f32
; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xff, v{{[0-9]+}}
; SI: buffer_store_dword [[TRUNC]]
-define void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
%den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
%num = load i8, i8 addrspace(1) * %in
%den = load i8, i8 addrspace(1) * %den_ptr
@@ -110,7 +117,7 @@ define void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
; SI: v_rcp_f32
; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xffff, v{{[0-9]+}}
; SI: buffer_store_dword [[TRUNC]]
-define void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
%den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
%num = load i16, i16 addrspace(1) * %in
%den = load i16, i16 addrspace(1) * %den_ptr
@@ -124,7 +131,7 @@ define void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
; SI: v_rcp_f32
; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
; SI: buffer_store_dword [[TRUNC]]
-define void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
+define amdgpu_kernel void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
%den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
%num = load i23, i23 addrspace(1) * %in
%den = load i23, i23 addrspace(1) * %den_ptr
@@ -136,7 +143,7 @@ define void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
; FUNC-LABEL: {{^}}v_udiv_i24:
; SI-NOT: v_rcp_f32
-define void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
+define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
%den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
%num = load i24, i24 addrspace(1) * %in
%den = load i24, i24 addrspace(1) * %den_ptr
@@ -152,9 +159,42 @@ define void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
; SI: v_mul_hi_u32
; SI: v_mul_hi_u32
-define void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
+define amdgpu_kernel void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
%1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
%2 = udiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
ret void
}
+
+; FUNC-LABEL: {{^}}test_udiv2:
+; SI: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1
+define amdgpu_kernel void @test_udiv2(i32 %p) {
+ %i = udiv i32 %p, 2
+ store volatile i32 %i, i32 addrspace(1)* undef
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_udiv_3_mulhu:
+; SI: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab
+; SI: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}}
+; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
+ %i = udiv i32 %p, 3
+ store volatile i32 %i, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}fdiv_test_denormals
+; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readonly %arg) {
+bb:
+ %tmp = load i8, i8 addrspace(1)* null, align 1
+ %tmp1 = sext i8 %tmp to i32
+ %tmp2 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 undef
+ %tmp3 = load i8, i8 addrspace(1)* %tmp2, align 1
+ %tmp4 = sext i8 %tmp3 to i32
+ %tmp5 = sdiv i32 %tmp1, %tmp4
+ %tmp6 = trunc i32 %tmp5 to i8
+ store i8 %tmp6, i8 addrspace(1)* null, align 1
+ ret void
+}
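The test_udiv_3_mulhu checks encode standard magic-number division: 0xaaaaaaab equals (2^33 + 1)/3, so for any 32-bit x, x/3 is the high half of x * 0xaaaaaaab shifted right once, which is exactly the v_mul_hi_u32 plus v_lshrrev_b32 pair expected above. The same strength reduction written out in IR, with a hypothetical name:

define i32 @udiv3_via_mulhi(i32 %x) {
  %wide = zext i32 %x to i64
  %mul = mul i64 %wide, 2863311531   ; 0xaaaaaaab = (2^33 + 1) / 3
  %hi = lshr i64 %mul, 33            ; mul_hi_u32 result, shifted by 1
  %q = trunc i64 %hi to i32          ; == udiv i32 %x, 3
  ret i32 %q
}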
diff --git a/test/CodeGen/AMDGPU/udivrem.ll b/test/CodeGen/AMDGPU/udivrem.ll
index 17f4ebf175d9..9507a49cfc8b 100644
--- a/test/CodeGen/AMDGPU/udivrem.ll
+++ b/test/CodeGen/AMDGPU/udivrem.ll
@@ -51,7 +51,7 @@
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e64
; SI: s_endpgm
-define void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) {
+define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) {
%result0 = udiv i32 %x, %y
store i32 %result0, i32 addrspace(1)* %out0
%result1 = urem i32 %x, %y
@@ -158,7 +158,7 @@ define void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e64
; SI: s_endpgm
-define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
%result0 = udiv <2 x i32> %x, %y
store <2 x i32> %result0, <2 x i32> addrspace(1)* %out
%result1 = urem <2 x i32> %x, %y
@@ -340,7 +340,7 @@ define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i3
; SI-DAG: v_subrev_i32_e32
; SI-DAG: v_cndmask_b32_e64
; SI: s_endpgm
-define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
%result0 = udiv <4 x i32> %x, %y
store <4 x i32> %result0, <4 x i32> addrspace(1)* %out
%result1 = urem <4 x i32> %x, %y
diff --git a/test/CodeGen/AMDGPU/udivrem24.ll b/test/CodeGen/AMDGPU/udivrem24.ll
index 6d145f1dbf09..6f144dcc6fd2 100644
--- a/test/CodeGen/AMDGPU/udivrem24.ll
+++ b/test/CodeGen/AMDGPU/udivrem24.ll
@@ -12,7 +12,7 @@
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
-define void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
%den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
%num = load i8, i8 addrspace(1) * %in
%den = load i8, i8 addrspace(1) * %den_ptr
@@ -31,7 +31,7 @@ define void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
-define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
%den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
%num = load i16, i16 addrspace(1) * %in, align 2
%den = load i16, i16 addrspace(1) * %den_ptr, align 2
@@ -50,7 +50,7 @@ define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
-define void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -67,7 +67,7 @@ define void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32
; EG-NOT: RECIP_IEEE
-define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -84,7 +84,7 @@ define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32
; EG-NOT: RECIP_IEEE
-define void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -101,7 +101,7 @@ define void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in
; SI: v_rcp_iflag
; SI-NOT: v_rcp_f32
; EG-NOT: RECIP_IEEE
-define void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -121,7 +121,7 @@ define void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in
; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -141,7 +141,7 @@ define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -161,7 +161,7 @@ define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -184,7 +184,7 @@ define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
-define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
%den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
%num = load i8, i8 addrspace(1) * %in
%den = load i8, i8 addrspace(1) * %den_ptr
@@ -203,7 +203,7 @@ define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
-define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
%den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
%num = load i16, i16 addrspace(1) * %in, align 2
%den = load i16, i16 addrspace(1) * %den_ptr, align 2
@@ -215,7 +215,7 @@ define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
; FUNC-LABEL: {{^}}urem24_i32:
; SI-NOT: v_rcp_f32
; EG-NOT: RECIP_IEEE
-define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -235,7 +235,7 @@ define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -255,7 +255,7 @@ define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -275,7 +275,7 @@ define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; EG-NOT: UINT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -294,7 +294,7 @@ define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],
; EG: RECIP_IEEE
-define void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -313,7 +313,7 @@ define void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %
; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],
; EG: RECIP_IEEE
-define void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
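The udiv24/urem24 patterns above rely on f32 having a 24-bit significand: any value that fits in 24 bits converts to float exactly, so the division can run through the fast reciprocal path. An illustrative sketch of the idea, not the exact expansion (the real lowering adds a rounding fixup after the reciprocal multiply):

define i32 @udiv24_idea(i32 %num, i32 %den) {
  ; assumes both operands already fit in 24 bits
  %fn = uitofp i32 %num to float   ; exact conversion
  %fd = uitofp i32 %den to float   ; exact conversion
  %fq = fdiv float %fn, %fd        ; lowered via v_rcp_f32 on this path
  %q = fptoui float %fq to i32
  ret i32 %q
}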
diff --git a/test/CodeGen/AMDGPU/udivrem64.ll b/test/CodeGen/AMDGPU/udivrem64.ll
index da61a841ff35..bd297920d563 100644
--- a/test/CodeGen/AMDGPU/udivrem64.ll
+++ b/test/CodeGen/AMDGPU/udivrem64.ll
@@ -70,7 +70,7 @@
;SI-NOT: v_lshr_b64
;VI-NOT: v_lshrrev_b64
;GCN: s_endpgm
-define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
%result = udiv i64 %x, %y
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -144,7 +144,7 @@ define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
;SI-NOT: v_lshr_b64
;VI-NOT: v_lshrrev_b64
;GCN: s_endpgm
-define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
%result = urem i64 %x, %y
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -159,7 +159,7 @@ define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
;SI-NOT: v_lshr_b64
;VI-NOT: v_lshrrev_b64
;GCN: s_endpgm
-define void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
%1 = lshr i64 %x, 33
%2 = lshr i64 %y, 33
%result = udiv i64 %1, %2
@@ -176,7 +176,7 @@ define void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
;SI-NOT: v_lshr_b64
;VI-NOT: v_lshrrev_b64
;GCN: s_endpgm
-define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
%1 = lshr i64 %x, 33
%2 = lshr i64 %y, 33
%result = urem i64 %1, %2
@@ -195,7 +195,7 @@ define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
;VI-NOT: v_lshrrev_b64
;GCN: v_mad_f32
;GCN: s_endpgm
-define void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
%1 = lshr i64 %x, 41
%2 = lshr i64 %y, 41
%result = udiv i64 %1, %2
@@ -214,7 +214,7 @@ define void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
;VI-NOT: v_lshrrev_b64
;GCN: v_mad_f32
;GCN: s_endpgm
-define void @test_urem2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_urem2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
%1 = lshr i64 %x, 41
%2 = lshr i64 %y, 41
%result = urem i64 %1, %2
diff --git a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index a4e18ebc9120..62943aeefbd8 100644
--- a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; SI-DAG: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32
; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]]
; SI: buffer_store_dwordx2 [[RESULT]]
-define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%val = load i64, i64 addrspace(1)* %gep, align 8
@@ -19,21 +19,21 @@ define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)
}
; SI-LABEL: {{^}}s_uint_to_fp_i64_to_f64
-define void @s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
%cast = uitofp i64 %in to double
store double %cast, double addrspace(1)* %out, align 8
ret void
}
; SI-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f64
-define void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) {
+define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) {
%cast = uitofp <2 x i64> %in to <2 x double>
store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16
ret void
}
; SI-LABEL: {{^}}s_uint_to_fp_v4i64_to_v4f64
-define void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) {
+define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) {
%cast = uitofp <4 x i64> %in to <4 x double>
store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16
ret void
@@ -42,7 +42,7 @@ define void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i
; SI-LABEL: {{^}}s_uint_to_fp_i32_to_f64
; SI: v_cvt_f64_u32_e32
; SI: s_endpgm
-define void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
%cast = uitofp i32 %in to double
store double %cast, double addrspace(1)* %out, align 8
ret void
@@ -52,7 +52,7 @@ define void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
; SI: v_cvt_f64_u32_e32
; SI: v_cvt_f64_u32_e32
; SI: s_endpgm
-define void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) {
%cast = uitofp <2 x i32> %in to <2 x double>
store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16
ret void
@@ -64,7 +64,7 @@ define void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i
; SI: v_cvt_f64_u32_e32
; SI: v_cvt_f64_u32_e32
; SI: s_endpgm
-define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) {
%cast = uitofp <4 x i32> %in to <4 x double>
store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16
ret void
@@ -79,7 +79,7 @@ define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i
; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; SI: buffer_store_dwordx2 v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
; SI: s_endpgm
-define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
%cmp = icmp eq i32 %in, 0
%fp = uitofp i1 %cmp to double
store double %fp, double addrspace(1)* %out, align 4
@@ -91,7 +91,7 @@ define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
; SI-NEXT: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]]
; SI: buffer_store_dwordx2 [[RESULT]]
; SI: s_endpgm
-define void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) {
+define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) {
%fp = uitofp i1 %in to double
store double %fp, double addrspace(1)* %out, align 8
ret void
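The v_uint_to_fp_i64_to_f64 checks at the top of this file encode the split conversion: a u64 converts as double(hi) * 2^32 + double(lo), with both halves converting exactly as 32-bit values, hence the cvt/cvt/ldexp/add sequence. The same identity as plain IR, with a hypothetical name:

define double @u64_to_f64_split(i64 %x) {
  %lo = and i64 %x, 4294967295                   ; low 32 bits
  %hi = lshr i64 %x, 32                          ; high 32 bits
  %flo = uitofp i64 %lo to double                ; exact
  %fhi = uitofp i64 %hi to double                ; exact
  %scaled = fmul double %fhi, 0x41F0000000000000 ; * 2^32, i.e. ldexp(fhi, 32)
  %r = fadd double %scaled, %flo
  ret double %r
}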
diff --git a/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
index cd816b27fce6..4168326e14c6 100644
--- a/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ b/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -4,7 +4,7 @@
; FIXME: This should be merged with uint_to_fp.ll, but s_uint_to_fp_v2i64 crashes on r600
; FUNC-LABEL: {{^}}s_uint_to_fp_i64_to_f16:
-define void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 {
%result = uitofp i64 %in to half
store half %result, half addrspace(1)* %out
ret void
@@ -24,7 +24,7 @@ define void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 {
; GCN: v_add_i32_e32 [[VR:v[0-9]+]]
; GCN: v_cvt_f16_f32_e32 [[VR_F16:v[0-9]+]], [[VR]]
; GCN: {{buffer|flat}}_store_short {{.*}}[[VR_F16]]
-define void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
@@ -35,7 +35,7 @@ define void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)*
}
; FUNC-LABEL: {{^}}s_uint_to_fp_i64_to_f32:
-define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
%result = uitofp i64 %in to float
store float %result, float addrspace(1)* %out
ret void
@@ -54,7 +54,7 @@ define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
; GCN: v_add_i32_e32 [[VR:v[0-9]+]]
; GCN: {{buffer|flat}}_store_dword {{.*}}[[VR]]
-define void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -65,14 +65,14 @@ define void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)*
}
; FUNC-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f32:
-define void @s_uint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{
+define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{
%result = uitofp <2 x i64> %in to <2 x float>
store <2 x float> %result, <2 x float> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}v_uint_to_fp_v4i64_to_v4f32:
-define void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
%out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
@@ -83,14 +83,14 @@ define void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i6
}
; FUNC-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f16:
-define void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{
+define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{
%result = uitofp <2 x i64> %in to <2 x half>
store <2 x half> %result, <2 x half> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}v_uint_to_fp_v4i64_to_v4f16:
-define void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
%out.gep = getelementptr <4 x half>, <4 x half> addrspace(1)* %out, i32 %tid
diff --git a/test/CodeGen/AMDGPU/uint_to_fp.ll b/test/CodeGen/AMDGPU/uint_to_fp.ll
index 3003226ca1a4..2e9918717c3a 100644
--- a/test/CodeGen/AMDGPU/uint_to_fp.ll
+++ b/test/CodeGen/AMDGPU/uint_to_fp.ll
@@ -6,7 +6,7 @@
; SI: v_cvt_f32_u32_e32
; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-define void @s_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 {
%result = uitofp i32 %in to float
store float %result, float addrspace(1)* %out
ret void
@@ -16,7 +16,7 @@ define void @s_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 {
; SI: v_cvt_f32_u32_e32 {{v[0-9]+}}, {{v[0-9]+$}}
; R600: INT_TO_FLT
-define void @v_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -32,7 +32,7 @@ define void @v_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)*
; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
-define void @s_uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0 {
%result = uitofp <2 x i32> %in to <2 x float>
store <2 x float> %result, <2 x float> addrspace(1)* %out
ret void
@@ -49,7 +49,7 @@ define void @s_uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i3
; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @s_uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
%value = load <4 x i32>, <4 x i32> addrspace(1) * %in
%result = uitofp <4 x i32> %value to <4 x float>
store <4 x float> %result, <4 x float> addrspace(1)* %out
@@ -66,7 +66,7 @@ define void @s_uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i3
; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @v_uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid
%out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
@@ -81,7 +81,7 @@ define void @v_uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrsp
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
-define void @s_uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) #0 {
%cmp = icmp eq i32 %in, 0
%fp = uitofp i1 %cmp to float
store float %fp, float addrspace(1)* %out
@@ -92,7 +92,7 @@ define void @s_uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) #0 {
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
-define void @s_uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) #0 {
%fp = uitofp i1 %in to float
store float %fp, float addrspace(1)* %out
ret void
@@ -105,7 +105,7 @@ define void @s_uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) #0 {
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0
; SI: {{buffer|flat}}_store_dword {{.*}}[[RESULT]]
; SI: s_endpgm
-define void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i1, i1 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -126,7 +126,7 @@ define void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)*
; R600-DAG: SETGT_UINT
; R600-DAG: SETE_INT
-define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
entry:
%cvt = uitofp i64 %in to float
store float %cvt, float addrspace(1)* %out
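The i1 cases above need no conversion instruction at all: uitofp of a single bit has only two possible results, so it lowers to the one v_cndmask_b32 between 0 and 1.0 that the checks expect. The equivalence as a sketch, with a hypothetical name:

define float @bit_to_f32(i1 %b) {
  %r = select i1 %b, float 1.0, float 0.0   ; same as uitofp i1 %b to float
  ret float %r
}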
diff --git a/test/CodeGen/AMDGPU/uitofp.f16.ll b/test/CodeGen/AMDGPU/uitofp.f16.ll
index faab5ca5db73..0c3b0fcaf854 100644
--- a/test/CodeGen/AMDGPU/uitofp.f16.ll
+++ b/test/CodeGen/AMDGPU/uitofp.f16.ll
@@ -8,7 +8,7 @@
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @uitofp_i16_to_f16(
+define amdgpu_kernel void @uitofp_i16_to_f16(
half addrspace(1)* %r,
i16 addrspace(1)* %a) {
entry:
@@ -24,7 +24,7 @@ entry:
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_I16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @uitofp_i32_to_f16(
+define amdgpu_kernel void @uitofp_i32_to_f16(
half addrspace(1)* %r,
i32 addrspace(1)* %a) {
entry:
@@ -38,18 +38,23 @@ entry:
; GCN-LABEL: {{^}}uitofp_v2i16_to_v2f16
; GCN: buffer_load_dword
-; SI: v_cvt_f32_u32_e32
-; SI: v_cvt_f32_u32_e32
-; VI: v_cvt_f32_i32_e32
-; VI: v_cvt_f32_i32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN-DAG: v_and_b32_e32
-; GCN-DAG: v_lshlrev_b32_e32
-; GCN-DAG: v_or_b32_e32
-; GCN: buffer_store_dword
-; GCN: s_endpgm
-define void @uitofp_v2i16_to_v2f16(
+
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI-DAG: v_lshlrev_b32_e32
+; SI: v_or_b32_e32
+
+; VI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f32_i32_sdwa
+; VI-DAG: v_cvt_f32_i32_sdwa
+; VI-DAG: v_cvt_f16_f32_sdwa
+; VI: v_or_b32_e32
+
+; GCN: buffer_store_dword
+; GCN: s_endpgm
+define amdgpu_kernel void @uitofp_v2i16_to_v2f16(
<2 x half> addrspace(1)* %r,
<2 x i16> addrspace(1)* %a) {
entry:
@@ -61,16 +66,23 @@ entry:
; GCN-LABEL: {{^}}uitofp_v2i32_to_v2f16
; GCN: buffer_load_dwordx2
-; GCN: v_cvt_f32_u32_e32
-; GCN: v_cvt_f32_u32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN-DAG: v_and_b32_e32
-; GCN-DAG: v_lshlrev_b32_e32
-; GCN-DAG: v_or_b32_e32
+
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI-DAG: v_lshlrev_b32_e32
+; SI: v_or_b32_e32
+
+; VI-DAG: v_cvt_f32_u32_e32
+; VI-DAG: v_cvt_f32_u32_e32
+; VI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_sdwa
+; VI: v_or_b32_e32
+
; GCN: buffer_store_dword
; GCN: s_endpgm
-define void @uitofp_v2i32_to_v2f16(
+define amdgpu_kernel void @uitofp_v2i32_to_v2f16(
<2 x half> addrspace(1)* %r,
<2 x i32> addrspace(1)* %a) {
entry:
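
The split SI/VI checks above reflect that VI can use SDWA forms to convert the sub-dword halves of the packed input directly, leaving a final v_or_b32 to repack, while SI converts through f32 and recombines with shift/or. The IR these checks exercise is just a packed uitofp; a minimal sketch with a hypothetical name:

define amdgpu_kernel void @sketch_uitofp_v2i16(<2 x half> addrspace(1)* %r, <2 x i16> addrspace(1)* %a) {
entry:
  %v = load <2 x i16>, <2 x i16> addrspace(1)* %a
  %cvt = uitofp <2 x i16> %v to <2 x half>
  store <2 x half> %cvt, <2 x half> addrspace(1)* %r
  ret void
}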
diff --git a/test/CodeGen/AMDGPU/umed3.ll b/test/CodeGen/AMDGPU/umed3.ll
index a2e485d36225..5a579f3575fd 100644
--- a/test/CodeGen/AMDGPU/umed3.ll
+++ b/test/CodeGen/AMDGPU/umed3.ll
@@ -1,12 +1,13 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i32:
; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
-define void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+define amdgpu_kernel void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@@ -24,8 +25,8 @@ define void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a
; GCN-LABEL: {{^}}v_test_umed3_multi_use_r_i_i_i32:
; GCN: v_max_u32
; GCN: v_min_u32
-define void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+define amdgpu_kernel void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@@ -44,8 +45,8 @@ define void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrsp
; GCN-LABEL: {{^}}v_test_umed3_r_i_i_constant_order_i32:
; GCN: v_max_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
; GCN: v_min_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
-define void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+define amdgpu_kernel void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@@ -63,8 +64,8 @@ define void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 a
; GCN-LABEL: {{^}}v_test_umed3_r_i_i_sign_mismatch_i32:
; GCN: v_max_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
; GCN: v_min_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
-define void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+define amdgpu_kernel void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@@ -82,8 +83,8 @@ define void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 ad
; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i64:
; GCN: v_cmp_lt_u64
; GCN: v_cmp_gt_u64
-define void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+define amdgpu_kernel void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
%a = load i64, i64 addrspace(1)* %gep0
@@ -99,9 +100,10 @@ define void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a
}
; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i16:
-; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
-define void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+; SICIVI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+define amdgpu_kernel void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
%a = load i16, i16 addrspace(1)* %gep0
@@ -171,7 +173,7 @@ define internal i8 @umax8(i8 %x, i8 %y) #2 {
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0:
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %x, i32 %y)
%tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -183,7 +185,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_1:
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %x, i32 %y)
%tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -195,7 +197,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_2:
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %x, i32 %y)
%tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -207,7 +209,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_3:
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %x, i32 %y)
%tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -219,7 +221,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_4:
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %y, i32 %x)
%tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -231,7 +233,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_5:
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %y, i32 %x)
%tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -243,7 +245,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_6:
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %y, i32 %x)
%tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -255,7 +257,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_7:
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %y, i32 %x)
%tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -267,7 +269,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_8:
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %x, i32 %y)
%tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -279,7 +281,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_9:
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %x, i32 %y)
%tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -291,7 +293,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_10:
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %x, i32 %y)
%tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -303,7 +305,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_11:
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %x, i32 %y)
%tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -315,7 +317,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_12:
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %y, i32 %x)
%tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -327,7 +329,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_13:
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %y, i32 %x)
%tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -339,7 +341,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_14:
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %y, i32 %x)
%tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -351,7 +353,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_15:
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %y, i32 %x)
%tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -366,7 +368,7 @@ bb:
; GCN: s_and_b32
; GCN: s_and_b32
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 {
bb:
%tmp0 = call i16 @umin16(i16 %x, i16 %y)
%tmp1 = call i16 @umax16(i16 %x, i16 %y)
@@ -381,7 +383,7 @@ bb:
; GCN: s_and_b32
; GCN: s_and_b32
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 {
bb:
%tmp0 = call i8 @umin8(i8 %x, i8 %y)
%tmp1 = call i8 @umax8(i8 %x, i8 %y)
@@ -393,7 +395,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_0:
; GCN-NOT: v_med3_u32
-define void @s_test_umed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %x, i32 %y)
%tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -406,7 +408,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_1:
; GCN-NOT: v_med3_u32
-define void @s_test_umed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %x, i32 %y)
%tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -419,7 +421,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_2:
; GCN-NOT: v_med3_u32
-define void @s_test_umed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %x, i32 %y)
%tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -432,7 +434,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_result:
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %x, i32 %y)
%tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -445,7 +447,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src0:
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_0_imm_src0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 1, i32 %y)
%tmp1 = call i32 @umax(i32 1, i32 %y)
@@ -457,7 +459,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src1:
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_0_imm_src1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %x, i32 2)
%tmp1 = call i32 @umax(i32 %x, i32 2)
@@ -469,7 +471,7 @@ bb:
; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src2:
; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 9
-define void @s_test_umed3_i32_pat_0_imm_src2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
bb:
%tmp0 = call i32 @umin(i32 %x, i32 %y)
%tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -479,6 +481,35 @@ bb:
ret void
}
+; GCN-LABEL: {{^}}v_test_umed3_i16_pat_0:
+; SI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; FIXME: VI not matching med3
+; VI: v_min_u16
+; VI: v_max_u16
+; VI: v_min_u16
+; VI: v_max_u16
+
+; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
+bb:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid
+ %gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3
+ %gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8
+ %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
+ %x = load i16, i16 addrspace(1)* %gep0
+ %y = load i16, i16 addrspace(1)* %gep1
+ %z = load i16, i16 addrspace(1)* %gep2
+
+ %tmp0 = call i16 @umin16(i16 %x, i16 %y)
+ %tmp1 = call i16 @umax16(i16 %x, i16 %y)
+ %tmp2 = call i16 @umin16(i16 %tmp1, i16 %z)
+ %tmp3 = call i16 @umax16(i16 %tmp0, i16 %tmp2)
+ store i16 %tmp3, i16 addrspace(1)* %out.gep
+ ret void
+}
+
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind readnone alwaysinline }
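
All of the s_test_umed3/v_test_umed3 functions feed the same matcher: a median-of-three written as a min/max composition, med3(x, y, z) = max(min(x, y), min(max(x, y), z)). A self-contained sketch of the pattern without the @umin/@umax wrappers (hypothetical function name):

; med3(x, y, z) == max(min(x, y), min(max(x, y), z)) with unsigned compares
define i32 @umed3_sketch(i32 %x, i32 %y, i32 %z) {
  %c0 = icmp ult i32 %x, %y
  %min.xy = select i1 %c0, i32 %x, i32 %y
  %max.xy = select i1 %c0, i32 %y, i32 %x
  %c1 = icmp ult i32 %max.xy, %z
  %mid = select i1 %c1, i32 %max.xy, i32 %z
  %c2 = icmp ugt i32 %min.xy, %mid
  %med = select i1 %c2, i32 %min.xy, i32 %mid
  ret i32 %med
}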
diff --git a/test/CodeGen/AMDGPU/unaligned-load-store.ll b/test/CodeGen/AMDGPU/unaligned-load-store.ll
index 0f76a54975e6..68aacd084bf9 100644
--- a/test/CodeGen/AMDGPU/unaligned-load-store.ll
+++ b/test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -8,7 +8,7 @@
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
-define void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 {
%v = load i16, i16 addrspace(3)* %p, align 1
store i16 %v, i16 addrspace(3)* %r, align 1
ret void
@@ -23,7 +23,7 @@ define void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(
; UNALIGNED: buffer_load_ushort
; UNALIGNED: buffer_store_short
; SI: s_endpgm
-define void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
%v = load i16, i16 addrspace(1)* %p, align 1
store i16 %v, i16 addrspace(1)* %r, align 1
ret void
@@ -42,7 +42,7 @@ define void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
-define void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
%v = load i32, i32 addrspace(3)* %p, align 1
store i32 %v, i32 addrspace(3)* %r, align 1
ret void
@@ -60,7 +60,7 @@ define void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(
; UNALIGNED: buffer_load_dword
; UNALIGNED: buffer_store_dword
-define void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
%v = load i32, i32 addrspace(1)* %p, align 1
store i32 %v, i32 addrspace(1)* %r, align 1
ret void
@@ -74,7 +74,7 @@ define void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace
; UNALIGNED: buffer_load_dword
; UNALIGNED: buffer_store_dword
-define void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
%v = load i32, i32 addrspace(1)* %p, align 2
store i32 %v, i32 addrspace(1)* %r, align 2
ret void
@@ -85,7 +85,7 @@ define void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)
; GCN: ds_read_u16
; GCN: ds_write_b16
; GCN: ds_write_b16
-define void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
%v = load i32, i32 addrspace(3)* %p, align 2
store i32 %v, i32 addrspace(3)* %r, align 2
ret void
@@ -132,7 +132,7 @@ define void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)*
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI: s_endpgm
-define void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) #0 {
%v = load i64, i64 addrspace(3)* %p, align 1
store i64 %v, i64 addrspace(3)* %r, align 1
ret void
@@ -179,7 +179,7 @@ define void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI: s_endpgm
-define void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 {
%v = load <2 x i32>, <2 x i32> addrspace(3)* %p, align 1
store <2 x i32> %v, <2 x i32> addrspace(3)* %r, align 1
ret void
@@ -209,7 +209,7 @@ define void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i
; UNALIGNED: buffer_load_dwordx2
; UNALIGNED: buffer_store_dwordx2
-define void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
%v = load i64, i64 addrspace(1)* %p, align 2
store i64 %v, i64 addrspace(1)* %r, align 2
ret void
@@ -239,7 +239,7 @@ define void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)
; UNALIGNED: buffer_load_dwordx2
; UNALIGNED: buffer_store_dwordx2
-define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
%v = load i64, i64 addrspace(1)* %p, align 1
store i64 %v, i64 addrspace(1)* %r, align 1
ret void
@@ -286,7 +286,7 @@ define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: s_endpgm
-define void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 {
%v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1
store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
ret void
@@ -329,7 +329,7 @@ define void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i
; UNALIGNED: buffer_load_dwordx4
; UNALIGNED: buffer_store_dwordx4
-define void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 {
%v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
ret void
@@ -337,7 +337,7 @@ define void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x
; FUNC-LABEL: {{^}}local_load_i64_align_4:
; GCN: ds_read2_b32
-define void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
%val = load i64, i64 addrspace(3)* %in, align 4
store i64 %val, i64 addrspace(1)* %out, align 8
ret void
@@ -345,7 +345,7 @@ define void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrsp
; FUNC-LABEL: {{^}}local_load_i64_align_4_with_offset
; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
-define void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
%ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4
%val = load i64, i64 addrspace(3)* %ptr, align 4
store i64 %val, i64 addrspace(1)* %out, align 8
@@ -356,7 +356,7 @@ define void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out
; Tests the case where the lo offset fits in 8 bits but the hi offset needs 9 bits
; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
; GCN: s_endpgm
-define void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
%ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
%ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
%ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
@@ -375,7 +375,7 @@ define void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocaptur
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: store_dwordx2
-define void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
%val = load i64, i64 addrspace(3)* %in, align 1
store i64 %val, i64 addrspace(1)* %out, align 8
ret void
@@ -383,7 +383,7 @@ define void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrsp
; FUNC-LABEL: {{^}}local_store_i64_align_4:
; GCN: ds_write2_b32
-define void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
+define amdgpu_kernel void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
store i64 %val, i64 addrspace(3)* %out, align 4
ret void
}
@@ -391,7 +391,7 @@ define void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
; FUNC-LABEL: {{^}}local_store_i64_align_4_with_offset
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
; GCN: s_endpgm
-define void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
%ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4
store i64 0, i64 addrspace(3)* %ptr, align 4
ret void
@@ -401,7 +401,7 @@ define void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
; Tests the case where the lo offset fits in 8 bits but the hi offset needs 9 bits
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
; GCN: s_endpgm
-define void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
%ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
%ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
%ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
@@ -418,7 +418,7 @@ define void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #
; UNALIGNED: s_load_dword
; SI: buffer_store_dword
-define void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
%v = load i32, i32 addrspace(2)* %p, align 1
store i32 %v, i32 addrspace(1)* %r, align 4
ret void
@@ -430,7 +430,7 @@ define void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)*
; UNALIGNED: s_load_dword
; UNALIGNED: buffer_store_dword
-define void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
%v = load i32, i32 addrspace(2)* %p, align 2
store i32 %v, i32 addrspace(1)* %r, align 4
ret void
@@ -444,7 +444,7 @@ define void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r
; UNALIGNED: s_load_dwordx2
; UNALIGNED: buffer_store_dwordx2
-define void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
%v = load i64, i64 addrspace(2)* %p, align 2
store i64 %v, i64 addrspace(1)* %r, align 4
ret void
@@ -453,7 +453,7 @@ define void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r
; SI-LABEL: {{^}}constant_align4_load_i64:
; SI: s_load_dwordx2
; SI: buffer_store_dwordx2
-define void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
%v = load i64, i64 addrspace(2)* %p, align 4
store i64 %v, i64 addrspace(1)* %r, align 4
ret void
@@ -462,7 +462,7 @@ define void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r
; SI-LABEL: {{^}}constant_align4_load_v4i32:
; SI: s_load_dwordx4
; SI: buffer_store_dwordx4
-define void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
%v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 4
store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
ret void
@@ -482,7 +482,7 @@ define void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> ad
; UNALIGNED: buffer_load_dwordx2
; SI: buffer_store_dwordx2
-define void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32> addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32> addrspace(1)* %r) #0 {
%v = load <2 x i32>, <2 x i32> addrspace(2)* %p, align 1
store <2 x i32> %v, <2 x i32> addrspace(1)* %r, align 4
ret void
@@ -512,7 +512,7 @@ define void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32>
; UNALIGNED: buffer_load_dwordx4
; SI: buffer_store_dwordx4
-define void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
%v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 1
store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
ret void
@@ -521,7 +521,7 @@ define void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32>
; SI-LABEL: {{^}}constant_align4_load_i8:
; SI: buffer_load_ubyte
; SI: buffer_store_byte
-define void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
%v = load i8, i8 addrspace(2)* %p, align 4
store i8 %v, i8 addrspace(1)* %r, align 4
ret void
@@ -530,7 +530,7 @@ define void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #
; SI-LABEL: {{^}}constant_align2_load_i8:
; SI: buffer_load_ubyte
; SI: buffer_store_byte
-define void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
%v = load i8, i8 addrspace(2)* %p, align 2
store i8 %v, i8 addrspace(1)* %r, align 2
ret void
@@ -541,7 +541,7 @@ define void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[LO]]
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HI]]
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
%gep0 = getelementptr i32, i32 addrspace(2)* %p, i64 1
%v0 = load i32, i32 addrspace(2)* %p, align 4
%v1 = load i32, i32 addrspace(2)* %gep0, align 4
@@ -571,7 +571,7 @@ define void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspac
; SI: ds_read_u8
; SI: ScratchSize: 0{{$}}
-define void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(3)* %in) #0 {
%ld = load <16 x i8>, <16 x i8> addrspace(3)* %in, align 1
store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
ret void
@@ -596,7 +596,7 @@ define void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> add
; SI: ds_write_b8
; SI: ScratchSize: 0{{$}}
-define void @local_store_align1_v16i8(<16 x i8> addrspace(3)* %out) #0 {
+define amdgpu_kernel void @local_store_align1_v16i8(<16 x i8> addrspace(3)* %out) #0 {
store <16 x i8> zeroinitializer, <16 x i8> addrspace(3)* %out, align 1
ret void
}
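
The local_load/store_i64_align_4 checks above rely on a specific lowering: a 64-bit LDS access that is only 4-byte aligned cannot use ds_read_b64/ds_write_b64, so it is split into a ds_read2_b32/ds_write2_b32 dword pair (using offset0/offset1 when the pointer carries a constant offset). A minimal sketch of the IR that produces this:

define amdgpu_kernel void @sketch_i64_lds_align4(i64 addrspace(1)* %out, i64 addrspace(3)* %in) {
  %v = load i64, i64 addrspace(3)* %in, align 4   ; selects ds_read2_b32, not ds_read_b64
  store i64 %v, i64 addrspace(1)* %out, align 8
  ret void
}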
diff --git a/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
index 4902e9a3cafb..3e80fcf85b52 100644
--- a/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
+++ b/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
@@ -4,7 +4,7 @@
; CHECK-LABEL: {{^}}func:
-define void @func() #0 {
+define amdgpu_kernel void @func() #0 {
B0:
br i1 undef, label %B1, label %B2
@@ -35,7 +35,8 @@ bb:
%tmp1 = load volatile i32, i32 addrspace(1)* undef, align 4
%tmp2 = insertelement <4 x i32> undef, i32 %tmp1, i32 0
%tmp3 = insertelement <4 x i32> %tmp2, i32 %tmp1, i32 1
- %tmp4 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tmp3, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp3.cast = bitcast <4 x i32> %tmp3 to <4 x float>
+ %tmp4 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tmp3.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp5 = extractelement <4 x float> %tmp4, i32 0
%tmp6 = fmul float %tmp5, undef
%tmp7 = fadd float %tmp6, %tmp6
@@ -71,7 +72,7 @@ bb11: ; preds = %bb9
; CHECK: v_mov_b32_e32 v[[OUTPUT_LO]], v6
; CHECK: buffer_store_dwordx4 v{{\[}}[[OUTPUT_LO]]:[[OUTPUT_HI]]{{\]}}
-define void @partially_undef_copy() #0 {
+define amdgpu_kernel void @partially_undef_copy() #0 {
%tmp0 = call i32 asm sideeffect "v_mov_b32_e32 v5, 5", "={VGPR5}"()
%tmp1 = call i32 asm sideeffect "v_mov_b32_e32 v6, 6", "={VGPR6}"()
@@ -83,8 +84,7 @@ define void @partially_undef_copy() #0 {
ret void
}
-declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare float @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1
attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
+attributes #1 = { nounwind readonly }
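
The intrinsic swap above follows the general llvm.SI.image.sample.* to llvm.amdgcn.image.sample.* migration: coordinates become floats (hence the bitcast of the <4 x i32> vector) and the trailing i32 modifier operands become five i1 flags (unorm, glc, slc, lwe, da). A sketch of a call in the new form, with hypothetical names:

declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1)

define <4 x float> @sample_sketch(<4 x i32> %coord, <8 x i32> %rsrc, <4 x i32> %samp) {
  ; Integer test inputs must be bitcast: the coordinate operand is now float.
  %coord.f = bitcast <4 x i32> %coord to <4 x float>
  ; dmask=15 requests all four channels; the i1 flags are unorm, glc, slc, lwe, da.
  %s = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %coord.f, <8 x i32> %rsrc, <4 x i32> %samp, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
  ret <4 x float> %s
}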
diff --git a/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll b/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
index d96ee6d21ce8..60ab7631a101 100644
--- a/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
+++ b/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
@@ -5,7 +5,7 @@
; SI hits an assertion at -O0; evergreen hits a "not implemented" unreachable.
; COMMON-LABEL: {{^}}branch_true:
-define void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+define amdgpu_kernel void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
entry:
br i1 true, label %for.end, label %for.body.lr.ph
@@ -42,7 +42,7 @@ for.end: ; preds = %for.body, %entry
; SI: s_cbranch_vccnz
; SI: s_cbranch_scc1
; SI: s_endpgm
-define void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+define amdgpu_kernel void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
entry:
br i1 false, label %for.end, label %for.body.lr.ph
@@ -79,7 +79,7 @@ for.end: ; preds = %for.body, %entry
; SI: s_cbranch_scc1
; SI: s_cbranch_scc1
; SI: s_endpgm
-define void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+define amdgpu_kernel void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
entry:
br i1 undef, label %for.end, label %for.body.lr.ph
diff --git a/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll b/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll
index 93a2c6998be4..eb6007f21c10 100644
--- a/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll
+++ b/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll
@@ -14,6 +14,7 @@ main_body:
if:
%u = fadd float %v, %v
+ call void asm sideeffect "", ""() #0 ; Prevent if-conversion
br label %else
else:
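
The empty sideeffect inline asm added above is the usual trick for pinning a block in place: the optimizer must treat the asm as opaque, so early if-conversion cannot speculate the block away and the branch under test survives into codegen. The idiom in isolation (a hypothetical kernel, not from this patch):

define amdgpu_kernel void @keep_the_branch(i32 addrspace(1)* %out, i32 %c) {
entry:
  %cmp = icmp eq i32 %c, 0
  br i1 %cmp, label %if, label %done

if:
  call void asm sideeffect "", ""()  ; opaque barrier: this block cannot be speculated
  store i32 1, i32 addrspace(1)* %out
  br label %done

done:
  ret void
}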
diff --git a/test/CodeGen/AMDGPU/uniform-cfg.ll b/test/CodeGen/AMDGPU/uniform-cfg.ll
index 154ac361e797..a9d45d71fa2e 100644
--- a/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=verde -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=0 -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-early-ifcvt=0 -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}uniform_if_scc:
; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0
@@ -12,7 +12,7 @@
; GCN: [[IF_LABEL]]:
; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) {
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %else
@@ -40,7 +40,7 @@ done:
; GCN: [[IF_LABEL]]:
; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) {
entry:
%cmp0 = fcmp oeq float %cond, 0.0
br i1 %cmp0, label %if, label %else
@@ -68,7 +68,7 @@ done:
; GCN: [[IF_LABEL]]:
; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) {
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %else, label %if
@@ -96,7 +96,7 @@ done:
; GCN: [[IF_LABEL]]:
; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) {
entry:
%cmp0 = fcmp oeq float %cond, 0.0
br i1 %cmp0, label %else, label %if
@@ -123,7 +123,7 @@ done:
; GCN: buffer_store_dword
; GCN: [[ENDIF_LABEL]]:
; GCN: s_endpgm
-define void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) {
+define amdgpu_kernel void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) {
entry:
%a.0 = fadd float %a, 10.0
%cond = bitcast float %a.0 to i32
@@ -148,7 +148,7 @@ endif:
; GCN: buffer_store_dword
; GCN: [[ENDIF_LABEL]]:
; GCN: s_endpgm
-define void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) {
+define amdgpu_kernel void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) {
entry:
%a.0 = fadd float %a, 10.0
%cond = bitcast float %a.0 to i32
@@ -166,7 +166,7 @@ endif:
; GCN-LABEL: {{^}}uniform_if_else_ret:
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
-; GCN-NEXT: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]]
+; GCN: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]]
; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
; GCN: buffer_store_dword [[TWO]]
@@ -176,7 +176,7 @@ endif:
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN: buffer_store_dword [[ONE]]
; GCN: s_endpgm
-define void @uniform_if_else_ret(i32 addrspace(1)* nocapture %out, i32 %a) {
+define amdgpu_kernel void @uniform_if_else_ret(i32 addrspace(1)* nocapture %out, i32 %a) {
entry:
%cmp = icmp eq i32 %a, 0
br i1 %cmp, label %if.then, label %if.else
@@ -209,7 +209,7 @@ if.end: ; preds = %if.else, %if.then
; GCN: v_mov_b32_e32 [[THREE:v[0-9]+]], 3
; GCN: buffer_store_dword [[THREE]]
; GCN: s_endpgm
-define void @uniform_if_else(i32 addrspace(1)* nocapture %out0, i32 addrspace(1)* nocapture %out1, i32 %a) {
+define amdgpu_kernel void @uniform_if_else(i32 addrspace(1)* nocapture %out0, i32 addrspace(1)* nocapture %out1, i32 %a) {
entry:
%cmp = icmp eq i32 %a, 0
br i1 %cmp, label %if.then, label %if.else
@@ -233,7 +233,7 @@ if.end: ; preds = %if.else, %if.then
; GCN: buffer_store_dword
; GCN: [[LABEL]]:
; GCN: s_endpgm
-define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
main_body:
%0 = icmp sgt i32 %cond, 0
%1 = sext i1 %0 to i32
@@ -252,11 +252,13 @@ ENDIF: ; preds = %IF, %main_body
; GCN: s_cmp_lt_i32 [[COND]], 1
; GCN: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]]
; GCN: v_cmp_gt_i32_e64 vcc, [[COND]], 0{{$}}
-; GCN: s_cbranch_vccnz [[EXIT]]
-; GCN: buffer_store
+; GCN: s_cbranch_vccz [[BODY:[A-Za-z0-9_]+]]
; GCN: {{^}}[[EXIT]]:
; GCN: s_endpgm
-define void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) {
+; GCN: {{^}}[[BODY]]:
+; GCN: buffer_store
+; GCN: s_endpgm
+define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%cmp0 = icmp sgt i32 %cond0, 0
@@ -282,7 +284,7 @@ bb9: ; preds = %bb8, %bb4
; SI: s_cmp_lg_u32 [[I]], 0
; SI: s_cbranch_scc1 [[LOOP_LABEL]]
; SI: s_endpgm
-define void @uniform_loop(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @uniform_loop(i32 addrspace(1)* %out, i32 %a) {
entry:
br label %loop
@@ -302,12 +304,13 @@ done:
; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
-; GCN: s_cbranch_execz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
; GCN: s_cmp_lg_u32 {{s[0-9]+}}, 0
-; GCN: s_cbranch_scc1 [[ENDIF_LABEL]]
+; GCN: s_cbranch_scc0 [[IF_UNIFORM_LABEL:[A-Z0-9_a-z]+]]
+; GCN: s_endpgm
+; GCN: {{^}}[[IF_UNIFORM_LABEL]]:
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN: buffer_store_dword [[ONE]]
-define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%d_cmp = icmp ult i32 %tid, 16
@@ -328,15 +331,14 @@ endif:
; GCN-LABEL: {{^}}divergent_inside_uniform:
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
-; GCN: s_cbranch_scc1 [[ENDIF_LABEL:[0-9_A-Za-z]+]]
+; GCN: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]]
+; GCN: [[IF_LABEL]]:
; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN: buffer_store_dword [[ONE]]
-; GCN: [[ENDIF_LABEL]]:
-; GCN: s_endpgm
-define void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) {
entry:
%u_cmp = icmp eq i32 %cond, 0
br i1 %u_cmp, label %if, label %endif
@@ -363,12 +365,12 @@ endif:
; GCN: buffer_store_dword [[ONE]]
; GCN: s_or_b64 exec, exec, [[MASK]]
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
-; GCN: s_cbranch_scc1 [[EXIT:[A-Z0-9_]+]]
+; GCN: s_cbranch_scc0 [[IF_UNIFORM:[A-Z0-9_]+]]
+; GCN: s_endpgm
+; GCN: [[IF_UNIFORM]]:
; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
; GCN: buffer_store_dword [[TWO]]
-; GCN: [[EXIT]]:
-; GCN: s_endpgm
-define void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%d_cmp = icmp eq i32 %tid, 0
@@ -408,7 +410,7 @@ exit:
; GCN: BB[[FNNUM]]_3:
; GCN: s_endpgm
-define void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tmp1 = icmp sgt i32 %cond, 0
@@ -443,7 +445,7 @@ bb9: ; preds = %bb8, %bb4
; GCN: [[IF_LABEL]]:
; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_scc_i64_eq(i64 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_scc_i64_eq(i64 %cond, i32 addrspace(1)* %out) {
entry:
%cmp0 = icmp eq i64 %cond, 0
br i1 %cmp0, label %if, label %else
@@ -475,7 +477,7 @@ done:
; GCN: [[IF_LABEL]]:
; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_scc_i64_ne(i64 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_scc_i64_ne(i64 %cond, i32 addrspace(1)* %out) {
entry:
%cmp0 = icmp ne i64 %cond, 0
br i1 %cmp0, label %if, label %else
@@ -503,7 +505,7 @@ done:
; GCN: [[IF_LABEL]]:
; GCN: v_mov_b32_e32 [[V_VAL]], [[S_VAL]]
; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_scc_i64_sgt(i64 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_scc_i64_sgt(i64 %cond, i32 addrspace(1)* %out) {
entry:
%cmp0 = icmp sgt i64 %cond, 0
br i1 %cmp0, label %if, label %else
@@ -522,7 +524,7 @@ done:
; GCN-LABEL: {{^}}move_to_valu_i64_eq:
; GCN: v_cmp_eq_u64_e32
-define void @move_to_valu_i64_eq(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @move_to_valu_i64_eq(i32 addrspace(1)* %out) {
%cond = load volatile i64, i64 addrspace(3)* undef
%cmp0 = icmp eq i64 %cond, 0
br i1 %cmp0, label %if, label %else
@@ -541,7 +543,7 @@ done:
; GCN-LABEL: {{^}}move_to_valu_i64_ne:
; GCN: v_cmp_ne_u64_e32
-define void @move_to_valu_i64_ne(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @move_to_valu_i64_ne(i32 addrspace(1)* %out) {
%cond = load volatile i64, i64 addrspace(3)* undef
%cmp0 = icmp ne i64 %cond, 0
br i1 %cmp0, label %if, label %else
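
These tests all hinge on one distinction: a condition computed from scalar (SGPR) values such as kernel arguments is uniform and lowers to s_cmp_* plus s_cbranch_scc*, while a condition derived from llvm.amdgcn.workitem.id.x differs per lane and must go through v_cmp_* and exec masking. A side-by-side sketch with hypothetical kernels:

declare i32 @llvm.amdgcn.workitem.id.x()

; %c is an SGPR argument: the branch is uniform (s_cmp + s_cbranch_scc*).
define amdgpu_kernel void @uniform_sketch(i32 addrspace(1)* %out, i32 %c) {
entry:
  %cmp = icmp eq i32 %c, 0
  br i1 %cmp, label %if, label %done
if:
  store volatile i32 1, i32 addrspace(1)* %out
  br label %done
done:
  ret void
}

; %tid differs per lane: the branch is divergent (v_cmp + s_and_saveexec_b64).
define amdgpu_kernel void @divergent_sketch(i32 addrspace(1)* %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %cmp = icmp eq i32 %tid, 0
  br i1 %cmp, label %if, label %done
if:
  store volatile i32 1, i32 addrspace(1)* %out
  br label %done
done:
  ret void
}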
diff --git a/test/CodeGen/AMDGPU/uniform-crash.ll b/test/CodeGen/AMDGPU/uniform-crash.ll
index cfbb2af58677..028199ef9de7 100644
--- a/test/CodeGen/AMDGPU/uniform-crash.ll
+++ b/test/CodeGen/AMDGPU/uniform-crash.ll
@@ -6,7 +6,7 @@
; GCN: s_cbranch_scc1 [[LABEL:BB[0-9_A-Z]+]]
; GCN: [[LABEL]]:
; GCN-NEXT: s_endpgm
-define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
main_body:
%0 = icmp sgt i32 %cond, 0
%1 = sext i1 %0 to i32
@@ -25,7 +25,7 @@ ENDIF: ; preds = %IF, %main_body
; GCN: {{^}}[[LOOP:[A-Z0-9_]+]]:
; GCN: s_cbranch_scc1 [[LOOP]]
; GCN: {{^}}[[BB0]]:
-define void @fix_sgpr_live_ranges_crash(i32 %arg, i32 %arg1) {
+define amdgpu_kernel void @fix_sgpr_live_ranges_crash(i32 %arg, i32 %arg1) {
bb:
%cnd = trunc i32 %arg to i1
br i1 %cnd, label %bb2, label %bb5
diff --git a/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll b/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
index 2c3a09818860..8a08f9d8bb0d 100644
--- a/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
+++ b/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
@@ -7,11 +7,11 @@
; CHECK: s_and_saveexec_b64
; CHECK-NEXT: s_xor_b64
; CHECK-NEXT: ; mask branch
-
+; CHECK-NEXT: s_cbranch_execz BB{{[0-9]+_[0-9]+}}
; CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %loop_body.preheader
; CHECK: [[LOOP_BODY_LABEL:BB[0-9]+_[0-9]+]]:
-; CHECK: s_cbranch_scc0 [[LOOP_BODY_LABEL]]
+; CHECK: s_cbranch_vccz [[LOOP_BODY_LABEL]]
; CHECK: s_endpgm
define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <2 x i32> %addr.base, i32 %y, i32 %p) {
@@ -38,7 +38,7 @@ out:
; CHECK-NEXT: s_xor_b64
; CHECK-NEXT: ; mask branch
; CHECK-NEXT: s_cbranch_execz
-define void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
main_body:
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%cc = icmp eq i32 %tid, 0
diff --git a/test/CodeGen/AMDGPU/unify-metadata.ll b/test/CodeGen/AMDGPU/unify-metadata.ll
index 9549b08ffee1..d96583e71f13 100644
--- a/test/CodeGen/AMDGPU/unify-metadata.ll
+++ b/test/CodeGen/AMDGPU/unify-metadata.ll
@@ -14,10 +14,6 @@
; ALL-DAG: ![[USED_EXT_1]] = !{!"cl_khr_fp16"}
; ALL-DAG: ![[USED_EXT_2]] = !{!"cl_doubles"}
-define void @test() {
- ret void
-}
-
!opencl.ocl.version = !{!1, !0, !0, !0}
!llvm.ident = !{!2, !2, !2, !2, !6}
!opencl.used.extensions = !{!3, !3, !4, !5}
diff --git a/test/CodeGen/AMDGPU/unigine-liveness-crash.ll b/test/CodeGen/AMDGPU/unigine-liveness-crash.ll
index 732790ceb335..853131baed5e 100644
--- a/test/CodeGen/AMDGPU/unigine-liveness-crash.ll
+++ b/test/CodeGen/AMDGPU/unigine-liveness-crash.ll
@@ -1,5 +1,4 @@
-; RUN: llc -march=amdgcn < %s | FileCheck %s
-; REQUIRES: asserts
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
;
; This test used to crash with the following assertion:
; llc: include/llvm/ADT/IntervalMap.h:632: unsigned int llvm::IntervalMapImpl::LeafNode<llvm::SlotIndex, llvm::LiveInterval *, 8, llvm::IntervalMapInfo<llvm::SlotIndex> >::insertFrom(unsigned int &, unsigned int, KeyT, KeyT, ValT) [KeyT = llvm::SlotIndex, ValT = llvm::LiveInterval *, N = 8, Traits = llvm::IntervalMapInfo<llvm::SlotIndex>]: Assertion `(i == Size || Traits::stopLess(b, start(i))) && "Overlapping insert"' failed.
@@ -10,31 +9,33 @@
;
; Check for a valid output.
; CHECK: image_sample_c
-
-target triple = "amdgcn--"
-
-@ddxy_lds = external addrspace(3) global [64 x i32]
-
define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg, [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg1, [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg2, [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg3, [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg4, float inreg %arg5, i32 inreg %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <3 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, <2 x i32> %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, i32 %arg20, float %arg21, i32 %arg22) #0 {
main_body:
- %tmp = call float @llvm.SI.fs.interp(i32 3, i32 4, i32 %arg6, <2 x i32> %arg8)
- %tmp23 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %i.i = extractelement <2 x i32> %arg8, i32 0
+ %j.i = extractelement <2 x i32> %arg8, i32 1
+ %i.f.i = bitcast i32 %i.i to float
+ %j.f.i = bitcast i32 %j.i to float
+ %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 3, i32 4, i32 %arg6) #2
+ %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 3, i32 4, i32 %arg6) #2
+ %tmp23 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+
%tmp24 = extractelement <4 x float> %tmp23, i32 3
%tmp25 = fmul float %tmp24, undef
- %tmp26 = fmul float undef, %tmp
+ %tmp26 = fmul float undef, %p2.i
%tmp27 = fadd float %tmp26, undef
%tmp28 = bitcast float %tmp27 to i32
%tmp29 = insertelement <4 x i32> undef, i32 %tmp28, i32 0
%tmp30 = insertelement <4 x i32> %tmp29, i32 0, i32 1
%tmp31 = insertelement <4 x i32> %tmp30, i32 undef, i32 2
- %tmp32 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp31, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp31.cast = bitcast <4 x i32> %tmp31 to <4 x float>
+ %tmp32 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp31.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp33 = extractelement <4 x float> %tmp32, i32 0
%tmp34 = fadd float undef, %tmp33
%tmp35 = fadd float %tmp34, undef
%tmp36 = fadd float %tmp35, undef
%tmp37 = fadd float %tmp36, undef
%tmp38 = fadd float %tmp37, undef
- %tmp39 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp39 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp40 = extractelement <4 x float> %tmp39, i32 0
%tmp41 = extractelement <4 x float> %tmp39, i32 1
%tmp42 = extractelement <4 x float> %tmp39, i32 2
@@ -51,7 +52,8 @@ main_body:
%tmp53 = insertelement <4 x i32> undef, i32 %tmp50, i32 0
%tmp54 = insertelement <4 x i32> %tmp53, i32 %tmp51, i32 1
%tmp55 = insertelement <4 x i32> %tmp54, i32 %tmp52, i32 2
- %tmp56 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp55, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp55.cast = bitcast <4 x i32> %tmp55 to <4 x float>
+ %tmp56 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp55.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp57 = extractelement <4 x float> %tmp56, i32 0
%tmp58 = fadd float %tmp38, %tmp57
%tmp59 = fadd float undef, %tmp46
@@ -60,7 +62,8 @@ main_body:
%tmp62 = bitcast float %tmp60 to i32
%tmp63 = insertelement <4 x i32> undef, i32 %tmp61, i32 1
%tmp64 = insertelement <4 x i32> %tmp63, i32 %tmp62, i32 2
- %tmp65 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp64, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp64.cast = bitcast <4 x i32> %tmp64 to <4 x float>
+ %tmp65 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp64.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp66 = extractelement <4 x float> %tmp65, i32 0
%tmp67 = fadd float %tmp58, %tmp66
%tmp68 = fmul float %tmp67, 1.250000e-01
@@ -76,8 +79,9 @@ IF26: ; preds = %main_body
ENDIF25: ; preds = %IF29, %main_body
%.4 = phi float [ %tmp84, %IF29 ], [ %tmp68, %main_body ]
%tmp73 = fadd float %.4, undef
- %tmp74 = call float @llvm.AMDGPU.clamp.(float %tmp73, float 0.000000e+00, float 1.000000e+00)
- %tmp75 = fmul float undef, %tmp74
+ %max.0.i = call float @llvm.maxnum.f32(float %tmp73, float 0.000000e+00)
+ %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00)
+ %tmp75 = fmul float undef, %clamp.i
%tmp76 = fmul float %tmp75, undef
%tmp77 = fadd float %tmp76, undef
%tmp78 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, float %tmp77, 11
@@ -99,17 +103,22 @@ IF29: ; preds = %LOOP
ENDIF28: ; preds = %LOOP
%tmp85 = insertelement <4 x i32> %tmp72, i32 undef, i32 1
%tmp86 = insertelement <4 x i32> %tmp85, i32 undef, i32 2
- %tmp87 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp86, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp86.cast = bitcast <4 x i32> %tmp86 to <4 x float>
+ %tmp87 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp86.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp88 = extractelement <4 x float> %tmp87, i32 0
%tmp89 = fadd float undef, %tmp88
br label %LOOP
}
-declare float @llvm.AMDGPU.clamp.(float, float, float) #1
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
-declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
+declare <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
-attributes #0 = { "InitialPSInputAddr"="36983" "target-cpu"="tonga" }
+attributes #0 = { nounwind "InitialPSInputAddr"="36983" "target-cpu"="tonga" }
attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
+attributes #3 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/unknown-processor.ll b/test/CodeGen/AMDGPU/unknown-processor.ll
index 941f4c601e34..25a700a943d2 100644
--- a/test/CodeGen/AMDGPU/unknown-processor.ll
+++ b/test/CodeGen/AMDGPU/unknown-processor.ll
@@ -13,7 +13,7 @@
; GCN: ScratchSize: 8{{$}}
; R600: MOV
-define void @foo() {
+define amdgpu_kernel void @foo() {
%alloca = alloca i32, align 4
store volatile i32 0, i32* %alloca
ret void
diff --git a/test/CodeGen/AMDGPU/unroll.ll b/test/CodeGen/AMDGPU/unroll.ll
index 411a15a4b839..2ce4de90a02d 100644
--- a/test/CodeGen/AMDGPU/unroll.ll
+++ b/test/CodeGen/AMDGPU/unroll.ll
@@ -6,10 +6,10 @@
; private memory. We want to make sure these kinds of loops are always
; unrolled, because private memory is slow.
-; CHECK-LABEL: @test
+; CHECK-LABEL: @private_memory
; CHECK-NOT: alloca
; CHECK: store i32 5, i32 addrspace(1)* %out
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @private_memory(i32 addrspace(1)* %out) {
entry:
%0 = alloca [32 x i32]
br label %loop.header
@@ -34,3 +34,67 @@ exit:
store i32 %3, i32 addrspace(1)* %out
ret void
}
+
+; Check that the loop is unrolled for local memory references
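+; The loop below runs 129 iterations (%counter 0..128 inclusive), so after
+; full unrolling the final store goes through the getelementptr at offset 128
+; that the checks match.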
+
+; CHECK-LABEL: @local_memory
+; CHECK: getelementptr i32, i32 addrspace(1)* %out, i32 128
+; CHECK-NEXT: store
+; CHECK-NEXT: ret
+define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) {
+entry:
+ br label %loop.header
+
+loop.header:
+ %counter = phi i32 [0, %entry], [%inc, %loop.inc]
+ br label %loop.body
+
+loop.body:
+ %ptr_lds = getelementptr i32, i32 addrspace(3)* %lds, i32 %counter
+ %val = load i32, i32 addrspace(3)* %ptr_lds
+ %ptr_out = getelementptr i32, i32 addrspace(1)* %out, i32 %counter
+ store i32 %val, i32 addrspace(1)* %ptr_out
+ br label %loop.inc
+
+loop.inc:
+ %inc = add i32 %counter, 1
+ %cond = icmp sge i32 %counter, 128
+ br i1 %cond, label %exit, label %loop.header
+
+exit:
+ ret void
+}
+
+; Check that a loop with an if inside is completely unrolled to eliminate the
+; phi and the if
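+; The loop below runs 48 iterations and stores only when %i1 is odd, so full
+; unrolling should leave 24 straight-line getelementptr/store pairs (the first
+; two are checked) and no remaining branches.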
+
+; CHECK-LABEL: @unroll_for_if
+; CHECK: entry:
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: store
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: store
+; CHECK-NOT: br
+define amdgpu_kernel void @unroll_for_if(i32* %a) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.inc
+ %i1 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %and = and i32 %i1, 1
+ %tobool = icmp eq i32 %and, 0
+ br i1 %tobool, label %for.inc, label %if.then
+
+if.then: ; preds = %for.body
+ %0 = sext i32 %i1 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %0
+ store i32 0, i32* %arrayidx, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %inc = add nuw nsw i32 %i1, 1
+ %cmp = icmp ult i32 %inc, 48
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.inc
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/unsupported-cc.ll b/test/CodeGen/AMDGPU/unsupported-cc.ll
index d120111a71fb..68e91e8c9c6b 100644
--- a/test/CodeGen/AMDGPU/unsupported-cc.ll
+++ b/test/CodeGen/AMDGPU/unsupported-cc.ll
@@ -6,7 +6,7 @@
; CHECK: LSHR
; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
; CHECK-NEXT: 5(7.006492e-45)
-define void @slt(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @slt(i32 addrspace(1)* %out, i32 %in) {
entry:
%0 = icmp slt i32 %in, 5
%1 = select i1 %0, i32 -1, i32 0
@@ -18,7 +18,7 @@ entry:
; CHECK: LSHR
; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
; CHECK-NEXT: 5(7.006492e-45)
-define void @ult_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @ult_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%0 = icmp ult i32 %in, 5
%1 = select i1 %0, i32 -1, i32 0
@@ -31,7 +31,7 @@ entry:
; CHECK-NEXT: 1084227584(5.000000e+00)
; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0
; CHECK-NEXT: LSHR *
-define void @ult_float(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ult_float(float addrspace(1)* %out, float %in) {
entry:
%0 = fcmp ult float %in, 5.0
%1 = select i1 %0, float 1.0, float 0.0
@@ -43,7 +43,7 @@ entry:
; CHECK: LSHR
; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}}
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @ult_float_native(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ult_float_native(float addrspace(1)* %out, float %in) {
entry:
%0 = fcmp ult float %in, 5.0
%1 = select i1 %0, float 0.0, float 1.0
@@ -55,7 +55,7 @@ entry:
; CHECK: LSHR
; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @olt(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @olt(float addrspace(1)* %out, float %in) {
entry:
%0 = fcmp olt float %in, 5.0
%1 = select i1 %0, float 1.0, float 0.0
@@ -67,7 +67,7 @@ entry:
; CHECK: LSHR
; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
; CHECK-NEXT: 6(8.407791e-45)
-define void @sle(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @sle(i32 addrspace(1)* %out, i32 %in) {
entry:
%0 = icmp sle i32 %in, 5
%1 = select i1 %0, i32 -1, i32 0
@@ -79,7 +79,7 @@ entry:
; CHECK: LSHR
; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
; CHECK-NEXT: 6(8.407791e-45)
-define void @ule_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @ule_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%0 = icmp ule i32 %in, 5
%1 = select i1 %0, i32 -1, i32 0
@@ -92,7 +92,7 @@ entry:
; CHECK-NEXT: 1084227584(5.000000e+00)
; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0
; CHECK-NEXT: LSHR *
-define void @ule_float(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ule_float(float addrspace(1)* %out, float %in) {
entry:
%0 = fcmp ule float %in, 5.0
%1 = select i1 %0, float 1.0, float 0.0
@@ -104,7 +104,7 @@ entry:
; CHECK: LSHR
; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}}
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @ule_float_native(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ule_float_native(float addrspace(1)* %out, float %in) {
entry:
%0 = fcmp ule float %in, 5.0
%1 = select i1 %0, float 0.0, float 1.0
@@ -116,7 +116,7 @@ entry:
; CHECK: LSHR
; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @ole(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ole(float addrspace(1)* %out, float %in) {
entry:
%0 = fcmp ole float %in, 5.0
%1 = select i1 %0, float 1.0, float 0.0
diff --git a/test/CodeGen/AMDGPU/urecip.ll b/test/CodeGen/AMDGPU/urecip.ll
deleted file mode 100644
index d58d2dc2d963..000000000000
--- a/test/CodeGen/AMDGPU/urecip.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
-
-; CHECK: v_rcp_iflag_f32_e32
-
-define void @test(i32 %p, i32 %q) {
- %i = udiv i32 %p, %q
- %r = bitcast i32 %i to float
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
- ret void
-}
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/urem.ll b/test/CodeGen/AMDGPU/urem.ll
index 9e2cfa34e0b9..fd7f8fa2efab 100644
--- a/test/CodeGen/AMDGPU/urem.ll
+++ b/test/CodeGen/AMDGPU/urem.ll
@@ -9,7 +9,7 @@
; FUNC-LABEL: {{^}}test_urem_i32:
; SI: s_endpgm
; EG: CF_END
-define void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load i32, i32 addrspace(1)* %in
%b = load i32, i32 addrspace(1)* %b_ptr
@@ -26,7 +26,7 @@ define void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; SI: v_sub_i32
; SI: buffer_store_dword
; SI: s_endpgm
-define void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%num = load i32, i32 addrspace(1)* %in
%result = urem i32 %num, 7
store i32 %result, i32 addrspace(1)* %out
@@ -36,7 +36,7 @@ define void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; FUNC-LABEL: {{^}}test_urem_v2i32:
; SI: s_endpgm
; EG: CF_END
-define void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
%a = load <2 x i32>, <2 x i32> addrspace(1)* %in
%b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
@@ -48,7 +48,7 @@ define void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1
; FUNC-LABEL: {{^}}test_urem_v4i32:
; SI: s_endpgm
; EG: CF_END
-define void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32>, <4 x i32> addrspace(1)* %in
%b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
@@ -60,7 +60,7 @@ define void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1
; FUNC-LABEL: {{^}}test_urem_i64:
; SI: s_endpgm
; EG: CF_END
-define void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
%a = load i64, i64 addrspace(1)* %in
%b = load i64, i64 addrspace(1)* %b_ptr
@@ -72,7 +72,7 @@ define void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; FUNC-LABEL: {{^}}test_urem_v2i64:
; SI: s_endpgm
; EG: CF_END
-define void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
%a = load <2 x i64>, <2 x i64> addrspace(1)* %in
%b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
@@ -84,7 +84,7 @@ define void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1
; FUNC-LABEL: {{^}}test_urem_v4i64:
; SI: s_endpgm
; EG: CF_END
-define void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
%a = load <4 x i64>, <4 x i64> addrspace(1)* %in
%b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
diff --git a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
index 82bdc261b112..f8e6b7edfe35 100644
--- a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -11,7 +11,7 @@ declare float @llvm.amdgcn.div.fixup.f32(float, float, float) #1
; GCN: s_load_dword [[SGPR:s[0-9]+]],
; GCN: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]]
; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 {
%dbl = fadd float %a, %a
store float %dbl, float addrspace(1)* %out, align 4
ret void
@@ -21,7 +21,7 @@ define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 {
; GCN: s_load_dword [[SGPR:s[0-9]+]],
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]]
; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 {
%fma = call float @llvm.fma.f32(float %a, float %a, float %a) #1
store float %fma, float addrspace(1)* %out, align 4
ret void
@@ -35,7 +35,7 @@ define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a)
; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]]
; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 {
%fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1
store float %fma, float addrspace(1)* %out, align 4
ret void
@@ -58,7 +58,7 @@ define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, floa
; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VA1]], [[SA]], [[VB]]
; GCN: buffer_store_dword [[RESULT0]]
; GCN: buffer_store_dword [[RESULT1]]
-define void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 {
%va0 = load volatile float, float addrspace(1)* %in
%va1 = load volatile float, float addrspace(1)* %in
%fma0 = call float @llvm.fma.f32(float %a, float %va0, float %b) #1
@@ -76,7 +76,7 @@ define void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float
; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 {
%fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1
store float %fma, float addrspace(1)* %out, align 4
ret void
@@ -90,7 +90,7 @@ define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, floa
; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 {
%fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1
store float %fma, float addrspace(1)* %out, align 4
ret void
@@ -100,7 +100,7 @@ define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, floa
; GCN: s_load_dword [[SGPR:s[0-9]+]]
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0
; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 {
%fma = call float @llvm.fma.f32(float %a, float %a, float 2.0) #1
store float %fma, float addrspace(1)* %out, align 4
ret void
@@ -110,7 +110,7 @@ define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, fl
; GCN: s_load_dword [[SGPR:s[0-9]+]]
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], 2.0, [[SGPR]]
; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
%fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1
store float %fma, float addrspace(1)* %out, align 4
ret void
@@ -121,7 +121,7 @@ define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, fl
; GCN: s_load_dword [[SGPR:s[0-9]+]]
; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, float %a) #0 {
%val = call float @llvm.amdgcn.div.fixup.f32(float 2.0, float %a, float %a) #1
store float %val, float addrspace(1)* %out, align 4
ret void
@@ -132,7 +132,7 @@ define void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, fl
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[VK]]
; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspace(1)* %out, float %a) #0 {
%fma = call float @llvm.fma.f32(float %a, float %a, float 1024.0) #1
store float %fma, float addrspace(1)* %out, align 4
ret void
@@ -143,7 +143,7 @@ define void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspace(1)* %out, f
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
; GCN: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], [[SGPR]]
; GCN: buffer_store_dword [[RESULT0]]
-define void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, float %a) #0 {
%fma = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1
store float %fma, float addrspace(1)* %out
ret void
@@ -158,7 +158,7 @@ define void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, f
; GCN: buffer_store_dword [[RESULT0]]
; GCN: buffer_store_dword [[RESULT1]]
; GCN: s_endpgm
-define void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out, float %a, float %b) #0 {
%fma0 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1
%fma1 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %b) #1
store volatile float %fma0, float addrspace(1)* %out
@@ -171,7 +171,7 @@ define void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]]
; GCN: buffer_store_dword [[RESULT]]
-define void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, float %a) #0 {
%fma = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1
store float %fma, float addrspace(1)* %out
ret void
@@ -186,7 +186,7 @@ define void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, f
; GCN: buffer_store_dword [[RESULT0]]
; GCN: buffer_store_dword [[RESULT1]]
; GCN: s_endpgm
-define void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out, float %a, float %b) #0 {
%fma0 = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1
%fma1 = call float @llvm.fma.f32(float 1024.0, float %b, float 1024.0) #1
store volatile float %fma0, float addrspace(1)* %out
@@ -199,7 +199,7 @@ define void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]]
; GCN: buffer_store_dword [[RESULT]]
-define void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, float %a) #0 {
%fma = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1
store float %fma, float addrspace(1)* %out
ret void
@@ -214,7 +214,7 @@ define void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, f
; GCN: buffer_store_dword [[RESULT0]]
; GCN: buffer_store_dword [[RESULT1]]
; GCN: s_endpgm
-define void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out, float %a, float %b) #0 {
%fma0 = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1
%fma1 = call float @llvm.fma.f32(float %b, float 1024.0, float 1024.0) #1
store volatile float %fma0, float addrspace(1)* %out
@@ -234,7 +234,7 @@ define void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out
; GCN: buffer_store_dword [[RESULT0]]
; GCN: buffer_store_dword [[RESULT1]]
-define void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 {
%fma0 = call float @llvm.fma.f32(float %a, float %b, float 1024.0) #1
%fma1 = call float @llvm.fma.f32(float %a, float %b, float 4096.0) #1
store volatile float %fma0, float addrspace(1)* %out
@@ -259,7 +259,7 @@ define void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 {
; GCN: buffer_store_dwordx2 [[RESULT0]]
; GCN: buffer_store_dwordx2 [[RESULT1]]
-define void @test_s0_s1_k_f64(double addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @test_s0_s1_k_f64(double addrspace(1)* %out, double %a, double %b) #0 {
%fma0 = call double @llvm.fma.f64(double %a, double %b, double 1024.0) #1
%fma1 = call double @llvm.fma.f64(double %a, double %b, double 4096.0) #1
store volatile double %fma0, double addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/usubo.ll b/test/CodeGen/AMDGPU/usubo.ll
index 3c9b1622a076..d1f454f0bc65 100644
--- a/test/CodeGen/AMDGPU/usubo.ll
+++ b/test/CodeGen/AMDGPU/usubo.ll
@@ -1,16 +1,16 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
-declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone
-declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone
-
-; FUNC-LABEL: {{^}}usubo_i64_zext:
+; FUNC-LABEL: {{^}}s_usubo_i64_zext:
+; GCN: s_sub_u32
+; GCN: s_subb_u32
+; GCN: v_cmp_gt_u64_e32 vcc
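+; (The subtraction itself stays scalar; the zext'd overflow bit is rebuilt
+; with an unsigned 64-bit compare instead of being taken from s_subb_u32.)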
; EG: SUBB_UINT
; EG: ADDC_UINT
-define void @usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
- %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind
+define amdgpu_kernel void @s_usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+ %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0
%val = extractvalue { i64, i1 } %usub, 0
%carry = extractvalue { i64, i1 } %usub, 1
%ext = zext i1 %carry to i64
@@ -19,13 +19,16 @@ define void @usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
ret void
}
+; FIXME: Could do this with scalar instructions (s_sub_i32)
+
; FUNC-LABEL: {{^}}s_usubo_i32:
-; SI: s_sub_i32
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
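+; (Both inputs are scalar here, but per the FIXME above the sub is currently
+; selected on the VALU, with the carry coming out of VCC via the cndmask.)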
; EG-DAG: SUBB_UINT
; EG-DAG: SUB_INT
-define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
- %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind
+define amdgpu_kernel void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
+ %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
%carry = extractvalue { i32, i1 } %usub, 1
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -34,14 +37,19 @@ define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
}
; FUNC-LABEL: {{^}}v_usubo_i32:
-; SI: v_subrev_i32_e32
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
; EG-DAG: SUBB_UINT
; EG-DAG: SUB_INT
-define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %a = load i32, i32 addrspace(1)* %aptr, align 4
- %b = load i32, i32 addrspace(1)* %bptr, align 4
- %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind
+define amdgpu_kernel void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
+ %a = load i32, i32 addrspace(1)* %a.gep, align 4
+ %b = load i32, i32 addrspace(1)* %b.gep, align 4
+ %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
%carry = extractvalue { i32, i1 } %usub, 1
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -49,16 +57,38 @@ define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
ret void
}
+; FUNC-LABEL: {{^}}v_usubo_i32_novcc:
+; GCN: v_sub_i32_e64 v{{[0-9]+}}, [[COND:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[COND]]
+
+; EG-DAG: SUBB_UINT
+; EG-DAG: SUB_INT
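+; (The inline asm below clobbers VCC, which is what forces the carry into an
+; SGPR pair and the _e64 encodings checked above.)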
+define amdgpu_kernel void @v_usubo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
+ %a = load i32, i32 addrspace(1)* %a.gep, align 4
+ %b = load i32, i32 addrspace(1)* %b.gep, align 4
+ %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
+ %val = extractvalue { i32, i1 } %usub, 0
+ %carry = extractvalue { i32, i1 } %usub, 1
+ store volatile i32 %val, i32 addrspace(1)* %out, align 4
+ call void asm sideeffect "", "~{VCC}"() #0
+ store volatile i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
; FUNC-LABEL: {{^}}s_usubo_i64:
-; SI: s_sub_u32
-; SI: s_subb_u32
+; GCN: s_sub_u32
+; GCN: s_subb_u32
; EG-DAG: SUBB_UINT
; EG-DAG: SUB_INT
; EG-DAG: SUB_INT
; EG: SUB_INT
-define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
- %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind
+define amdgpu_kernel void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 {
+ %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %usub, 0
%carry = extractvalue { i64, i1 } %usub, 1
store i64 %val, i64 addrspace(1)* %out, align 8
@@ -67,20 +97,50 @@ define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64
}
; FUNC-LABEL: {{^}}v_usubo_i64:
-; SI: v_sub_i32
-; SI: v_subb_u32
+; GCN: v_sub_i32
+; GCN: v_subb_u32
; EG-DAG: SUBB_UINT
; EG-DAG: SUB_INT
; EG-DAG: SUB_INT
; EG: SUB_INT
-define void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
- %a = load i64, i64 addrspace(1)* %aptr, align 4
- %b = load i64, i64 addrspace(1)* %bptr, align 4
- %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind
+define amdgpu_kernel void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds i64, i64 addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds i64, i64 addrspace(1)* %b.ptr, i64 %tid.ext
+ %a = load i64, i64 addrspace(1)* %a.gep
+ %b = load i64, i64 addrspace(1)* %b.gep
+ %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %usub, 0
%carry = extractvalue { i64, i1 } %usub, 1
store i64 %val, i64 addrspace(1)* %out, align 8
store i1 %carry, i1 addrspace(1)* %carryout
ret void
}
+
+; FUNC-LABEL: {{^}}v_usubo_i16:
+; VI: v_subrev_u16_e32
+; VI: v_cmp_gt_u16_e32
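+; (Checked on VI only: 16-bit VALU instructions such as v_subrev_u16 were
+; introduced with VI, so SI would legalize this through 32-bit operations.)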
+define amdgpu_kernel void @v_usubo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %b.ptr, i64 %tid.ext
+ %a = load i16, i16 addrspace(1)* %a.gep
+ %b = load i16, i16 addrspace(1)* %b.gep
+ %usub = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 %a, i16 %b)
+ %val = extractvalue { i16, i1 } %usub, 0
+ %carry = extractvalue { i16, i1 } %usub, 1
+ store i16 %val, i16 addrspace(1)* %out
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare { i16, i1 } @llvm.usub.with.overflow.i16(i16, i16) #1
+declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1
+declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll b/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll
index a48e7acd4cf3..b7d766aa395e 100644
--- a/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll
+++ b/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll
@@ -1,14 +1,14 @@
; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s
; CHECK-LABEL: {{^}}kernel_arg_i64:
-define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
+define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
store i64 %a, i64 addrspace(1)* %out, align 8
ret void
}
; i64 arg works, v1i64 arg does not.
; CHECK-LABEL: {{^}}kernel_arg_v1i64:
-define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
+define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
ret void
}
diff --git a/test/CodeGen/AMDGPU/v_cndmask.ll b/test/CodeGen/AMDGPU/v_cndmask.ll
index 1cd49feb0d88..d4a68a418ee4 100644
--- a/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -4,12 +4,12 @@
declare i32 @llvm.amdgcn.workitem.id.x() #1
; GCN-LABEL: {{^}}v_cnd_nan_nosgpr:
-; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0
-; GCN: v_cndmask_b32_e32 v{{[0-9]}}, -1, v{{[0-9]+}}, vcc
+; GCN: v_cmp_eq_u32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0
+; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]}}, -1, v{{[0-9]+}}, [[COND]]
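+; (The [[COND]] pattern accepts either vcc with the _e32 form or an SGPR pair
+; with the _e64 form, so the test no longer pins down which one is selected.)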
; GCN-DAG: v{{[0-9]}}
; All nan values are converted to 0xffffffff
; GCN: s_endpgm
-define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
+define amdgpu_kernel void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
%idx = call i32 @llvm.amdgcn.workitem.id.x() #1
%f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx
%f = load float, float addrspace(1)* %f.gep
@@ -30,7 +30,7 @@ define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(
; GCN-DAG: v{{[0-9]}}
; All nan values are converted to 0xffffffff
; GCN: s_endpgm
-define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 {
+define amdgpu_kernel void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 {
%setcc = icmp ne i32 %c, 0
%select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
store float %select, float addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 {
; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc
-define void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
@@ -62,7 +62,7 @@ define void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float %
; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc
-define void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
@@ -77,7 +77,7 @@ define void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %
; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], vcc
-define void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
@@ -92,7 +92,7 @@ define void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float %
; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], vcc
-define void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
@@ -105,9 +105,9 @@ define void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_vgprZ_f32:
; GCN-DAG: s_load_dword [[X:s[0-9]+]]
; GCN-DAG: {{buffer|flat}}_load_dword [[Z:v[0-9]+]]
-; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
-; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[Z]], vcc
-define void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
+; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
+; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 0, [[Z]], [[COND]]
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
@@ -122,9 +122,9 @@ define void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_vgprZ_f32:
; GCN-DAG: {{buffer|flat}}_load_dword [[Z:v[0-9]+]]
; GCN-DAG: s_load_dword [[X:s[0-9]+]]
-; GCN: v_cmp_nlg_f32_e64 vcc, [[X]], 0
-; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[Z]], vcc
-define void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
+; GCN: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
+; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0, [[Z]], [[COND]]
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
@@ -142,7 +142,7 @@ define void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %
; GCN-DAG: v_cmp_ngt_f32_e32 vcc, 0, [[X]]
; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc
-define void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -159,7 +159,7 @@ define void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float a
; GCN: {{buffer|flat}}_load_dword [[Z:v[0-9]+]]
; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[Z]], vcc
-define void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -178,7 +178,7 @@ define void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float a
; GCN: {{buffer|flat}}_load_dword [[Z:v[0-9]+]]
; GCN: v_cmp_lt_i32_e32 vcc, -1, [[X]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 2, [[Z]], vcc
-define void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i32 addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i32 addrspace(1)* %z.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
@@ -203,7 +203,7 @@ define void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrs
; VI-DAG: v_cmp_lt_i64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}}
; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v[[Z_HI]], s
; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 2, v[[Z_LO]], s
-define void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds i64, i64 addrspace(1)* %x.ptr, i64 %tid.ext
@@ -226,7 +226,7 @@ define void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrs
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
-define void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -249,7 +249,7 @@ define void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out,
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
-define void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -275,7 +275,7 @@ define void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out,
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
-define void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -298,7 +298,7 @@ define void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out,
; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, vcc
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s
; GCN: store_byte
-define void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
@@ -321,7 +321,7 @@ define void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspa
; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
-define void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -343,7 +343,7 @@ define void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, flo
; GCN: v_cmp_nlg_f32_e32 vcc, 0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
-define void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -364,7 +364,7 @@ define void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float
; GCN: v_cmp_gt_u32_e32 vcc, 2, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, [[Z]], vcc
-define void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
@@ -386,7 +386,7 @@ define void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32
; GCN: v_cmp_nle_f32_e32 vcc, 4.0, [[X]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -1.0, vcc
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -2.0, vcc
-define void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
diff --git a/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll b/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll
index 9246ce38dbed..2cda52a8438a 100644
--- a/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll
+++ b/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll
@@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.cvt.pk.u8.f32(float, i32, i32) #0
; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_0:
; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 0, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_idx_0(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_0(i32 addrspace(1)* %out, float %src, i32 %reg) {
%result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 0, i32 %reg) #0
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
@@ -13,7 +13,7 @@ define void @v_cvt_pk_u8_f32_idx_0(i32 addrspace(1)* %out, float %src, i32 %reg)
; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_1:
; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_idx_1(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_1(i32 addrspace(1)* %out, float %src, i32 %reg) {
%result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 1, i32 %reg) #0
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
@@ -21,7 +21,7 @@ define void @v_cvt_pk_u8_f32_idx_1(i32 addrspace(1)* %out, float %src, i32 %reg)
; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_2:
; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_idx_2(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_2(i32 addrspace(1)* %out, float %src, i32 %reg) {
%result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 2, i32 %reg) #0
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
@@ -29,7 +29,7 @@ define void @v_cvt_pk_u8_f32_idx_2(i32 addrspace(1)* %out, float %src, i32 %reg)
; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_3:
; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 3, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_idx_3(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_3(i32 addrspace(1)* %out, float %src, i32 %reg) {
%result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 3, i32 %reg) #0
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
@@ -40,7 +40,7 @@ define void @v_cvt_pk_u8_f32_idx_3(i32 addrspace(1)* %out, float %src, i32 %reg)
; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}}
; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}}
; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 3, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_combine(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_combine(i32 addrspace(1)* %out, float %src, i32 %reg) {
%result0 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 0, i32 %reg) #0
%result1 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 1, i32 %result0) #0
%result2 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 2, i32 %result1) #0
@@ -51,7 +51,7 @@ define void @v_cvt_pk_u8_f32_combine(i32 addrspace(1)* %out, float %src, i32 %re
; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx:
; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_idx(i32 addrspace(1)* %out, float %src, i32 %idx, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx(i32 addrspace(1)* %out, float %src, i32 %idx, i32 %reg) {
%result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 %idx, i32 %reg) #0
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/v_mac.ll b/test/CodeGen/AMDGPU/v_mac.ll
index 9a2dc743d6c9..2b96f7d50076 100644
--- a/test/CodeGen/AMDGPU/v_mac.ll
+++ b/test/CodeGen/AMDGPU/v_mac.ll
@@ -1,5 +1,6 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s
; GCN-LABEL: {{^}}mac_vvv:
; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
@@ -7,7 +8,7 @@
; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
; GCN: v_mac_f32_e32 [[C]], [[B]], [[A]]
; GCN: buffer_store_dword [[C]]
-define void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
%c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -25,7 +26,7 @@ entry:
; GCN-LABEL: {{^}}mad_inline_sgpr_inline:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]}}, s{{[0-9]+}}, 0.5, 0.5
-define void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) #0 {
entry:
%tmp0 = fmul float 0.5, %in
%tmp1 = fadd float %tmp0, 0.5
@@ -36,7 +37,7 @@ entry:
; GCN-LABEL: {{^}}mad_vvs:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-define void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) #0 {
+define amdgpu_kernel void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) #0 {
entry:
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
@@ -51,7 +52,7 @@ entry:
; GCN-LABEL: {{^}}mac_ssv:
; GCN: v_mac_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) #0 {
+define amdgpu_kernel void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) #0 {
entry:
%c = load float, float addrspace(1)* %in
@@ -64,7 +65,7 @@ entry:
; GCN-LABEL: {{^}}mac_mad_same_add:
; GCN: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
; GCN: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
-define void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
%c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -95,7 +96,7 @@ entry:
; GCN-LABEL: {{^}}mad_neg_src0:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-define void @mad_neg_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @mad_neg_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
%c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -112,10 +113,10 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}unsafe_mad_sub0_src0:
+; GCN-LABEL: {{^}}nsz_mad_sub0_src0:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
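+; (Folding 0 - x into a negated mad operand is only sound when signed zeros
+; can be ignored, hence the no-signed-zeros-fp-math="true" attribute (#1) on
+; this function.)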
-define void @unsafe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @nsz_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
entry:
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
%c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -135,7 +136,7 @@ entry:
; GCN-LABEL: {{^}}safe_mad_sub0_src0:
; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0,
; GCN: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[SUB0]]
-define void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
%c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -155,7 +156,7 @@ entry:
; GCN-LABEL: {{^}}mad_neg_src1:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-define void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
%c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -172,10 +173,10 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}unsafe_mad_sub0_src1:
+; GCN-LABEL: {{^}}nsz_mad_sub0_src1:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-define void @unsafe_mad_sub0_src1(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @nsz_mad_sub0_src1(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
entry:
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
%c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -195,7 +196,7 @@ entry:
; GCN-LABEL: {{^}}mad_neg_src2:
; GCN-NOT: v_mac
; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
-define void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
%c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -221,7 +222,7 @@ entry:
; GCN: v_add_f32_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
; GCN: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
-define void @fold_inline_imm_into_mac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) #3 {
+define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) #3 {
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -247,12 +248,16 @@ bb:
; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[B:v[0-9]+]]
-; FIXME: How is this not folded?
-; SI: v_cvt_f32_f16_e32 v{{[0-9]+}}, 0x3c00
+; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[A]]
+; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[B]]
-; VI: v_add_f16_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
-; VI: v_mad_f16 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
-define void @fold_inline_imm_into_mac_src2_f16(half addrspace(1)* %out, half addrspace(1)* %a, half addrspace(1)* %b) #3 {
+; SI: v_add_f32_e32 [[TMP2:v[0-9]+]], [[CVT_A]], [[CVT_A]]
+; SI: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
+; SI: v_mac_f32_e32 v{{[0-9]+}}, 0x41000000, v{{[0-9]+}}
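+; (0x41000000 is 8.0; on SI the f16 math is promoted to f32, so the constant
+; is emitted as a 32-bit literal in the final mac.)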
+
+; VI-FLUSH: v_add_f16_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
+; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
+define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f16(half addrspace(1)* %out, half addrspace(1)* %a, half addrspace(1)* %b) #3 {
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -276,7 +281,7 @@ bb:
declare i32 @llvm.amdgcn.workitem.id.x() #2
-attributes #0 = { nounwind "unsafe-fp-math"="false" }
-attributes #1 = { nounwind "unsafe-fp-math"="true" }
+attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" }
+attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/v_mac_f16.ll b/test/CodeGen/AMDGPU/v_mac_f16.ll
index 151f2cc9fc73..c45af522ec49 100644
--- a/test/CodeGen/AMDGPU/v_mac_f16.ll
+++ b/test/CodeGen/AMDGPU/v_mac_f16.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; GCN-LABEL: {{^}}mac_f16
+; GCN-LABEL: {{^}}mac_f16:
; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort v[[C_F16:[0-9]+]]
@@ -14,7 +14,7 @@
; VI: v_mac_f16_e32 v[[C_F16]], v[[B_F16]], v[[A_F16]]
; VI: buffer_store_short v[[C_F16]]
; GCN: s_endpgm
-define void @mac_f16(
+define amdgpu_kernel void @mac_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b,
@@ -31,13 +31,14 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}mac_f16_same_add
+; GCN-LABEL: {{^}}mac_f16_same_add:
; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
; SI: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
+
; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
; VI: v_mac_f16_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_endpgm
-define void @mac_f16_same_add(
+define amdgpu_kernel void @mac_f16_same_add(
half addrspace(1)* %r0,
half addrspace(1)* %r1,
half addrspace(1)* %a,
@@ -63,13 +64,16 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}mac_f16_neg_a
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_f16_neg_a:
+; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
+
; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_endpgm
-define void @mac_f16_neg_a(
+define amdgpu_kernel void @mac_f16_neg_a(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b,
@@ -87,13 +91,16 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}mac_f16_neg_b
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_f16_neg_b:
+; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
+
; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_endpgm
-define void @mac_f16_neg_b(
+define amdgpu_kernel void @mac_f16_neg_b(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b,
@@ -111,13 +118,16 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}mac_f16_neg_c
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_f16_neg_c:
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+
; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; GCN: s_endpgm
-define void @mac_f16_neg_c(
+define amdgpu_kernel void @mac_f16_neg_c(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b,
@@ -135,14 +145,13 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}mac_f16_neg_a_safe_fp_math
-; SI: v_cvt_f32_f16_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; SI: v_subrev_f32_e32 v[[NEG_A:[0-9]+]], v{{[0-9]+}}, v[[ZERO]]
+; GCN-LABEL: {{^}}mac_f16_neg_a_safe_fp_math:
+; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
; GCN: s_endpgm
-define void @mac_f16_neg_a_safe_fp_math(
+define amdgpu_kernel void @mac_f16_neg_a_safe_fp_math(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b,
@@ -160,14 +169,13 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}mac_f16_neg_b_safe_fp_math
-; SI: v_cvt_f32_f16_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; SI: v_subrev_f32_e32 v[[NEG_A:[0-9]+]], v{{[0-9]+}}, v[[ZERO]]
+; GCN-LABEL: {{^}}mac_f16_neg_b_safe_fp_math:
+; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
; GCN: s_endpgm
-define void @mac_f16_neg_b_safe_fp_math(
+define amdgpu_kernel void @mac_f16_neg_b_safe_fp_math(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b,
@@ -185,14 +193,13 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}mac_f16_neg_c_safe_fp_math
-; SI: v_cvt_f32_f16_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; SI: v_subrev_f32_e32 v[[NEG_A:[0-9]+]], v{{[0-9]+}}, v[[ZERO]]
+; GCN-LABEL: {{^}}mac_f16_neg_c_safe_fp_math:
+; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_mac_f32_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_mac_f16_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_endpgm
-define void @mac_f16_neg_c_safe_fp_math(
+define amdgpu_kernel void @mac_f16_neg_c_safe_fp_math(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b,
@@ -210,13 +217,16 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}mac_f16_neg_a_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
+; GCN-LABEL: {{^}}mac_f16_neg_a_nsz_fp_math:
+; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
+
; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
; GCN: s_endpgm
-define void @mac_f16_neg_a_unsafe_fp_math(
+define amdgpu_kernel void @mac_f16_neg_a_nsz_fp_math(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b,
@@ -234,13 +244,16 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}mac_f16_neg_b_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
+; GCN-LABEL: {{^}}mac_f16_neg_b_nsz_fp_math:
+; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
+
; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
; GCN: s_endpgm
-define void @mac_f16_neg_b_unsafe_fp_math(
+define amdgpu_kernel void @mac_f16_neg_b_nsz_fp_math(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b,
@@ -258,13 +271,16 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}mac_f16_neg_c_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}}
+; GCN-LABEL: {{^}}mac_f16_neg_c_nsz_fp_math:
+; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, [[CVT_A]], [[CVT_B]], -[[CVT_C]]
+
; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}}
; GCN: s_endpgm
-define void @mac_f16_neg_c_unsafe_fp_math(
+define amdgpu_kernel void @mac_f16_neg_c_nsz_fp_math(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b,
@@ -282,33 +298,38 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}mac_v2f16
+; GCN-LABEL: {{^}}mac_v2f16:
; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
-; SI: v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[C_F32_0]]
-; SI: v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
-; SI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
+; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
+; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; VI: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]]
-; VI: v_mac_f16_e32 v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]]
-; VI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[C_V2_F16]]
-; VI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; VI-NOT: and
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+
+; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; VI-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[C_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[C_V2_F16]], v[[B_V2_F16]]
+; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]]
+; VI-NOT: and
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[A_V2_F16]]
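+; (In the v_mac_f16_sdwa form above, src0_sel/src1_sel:WORD_1 read the high
+; 16-bit halves of the packed sources directly, so only the accumulator's
+; high half still needs an explicit v_lshrrev_b32 extraction.)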
+
; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
-define void @mac_v2f16(
+define amdgpu_kernel void @mac_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b,
@@ -325,17 +346,19 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}mac_v2f16_same_add
-; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD0:v[0-9]+]]
-; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD1:v[0-9]+]]
-; SI: v_mac_f32_e32 [[ADD0]], v{{[0-9]+}}, v{{[0-9]+}}
-; SI: v_mac_f32_e32 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD0:v[0-9]+]]
-; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD1:v[0-9]+]]
-; VI: v_mac_f16_e32 [[ADD0]], v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_mac_f16_e32 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_v2f16_same_add:
+; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; VI-DAG: v_mac_f16_sdwa v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
; GCN: s_endpgm
-define void @mac_v2f16_same_add(
+define amdgpu_kernel void @mac_v2f16_same_add(
<2 x half> addrspace(1)* %r0,
<2 x half> addrspace(1)* %r1,
<2 x half> addrspace(1)* %a,
@@ -361,15 +384,18 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}mac_v2f16_neg_a
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_a:
+; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
+
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, -[[CVT0]], v{{[0-9]+}}, v{{[0-9]+}}
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, [[CVT1]], v{{[0-9]+}}
+
; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_endpgm
-define void @mac_v2f16_neg_a(
+define amdgpu_kernel void @mac_v2f16_neg_a(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b,
@@ -388,14 +414,17 @@ entry:
}
; GCN-LABEL: {{^}}mac_v2f16_neg_b
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, -[[CVT0]], v{{[0-9]+}}, v{{[0-9]+}}
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, [[CVT1]], v{{[0-9]+}}
+
+
; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_endpgm
-define void @mac_v2f16_neg_b(
+define amdgpu_kernel void @mac_v2f16_neg_b(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b,
@@ -413,15 +442,22 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}mac_v2f16_neg_c
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
-; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_c:
+; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
+
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -[[CVT2]]
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -[[CVT5]]
+
; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; GCN: s_endpgm
-define void @mac_v2f16_neg_c(
+define amdgpu_kernel void @mac_v2f16_neg_c(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b,
@@ -439,18 +475,20 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}mac_v2f16_neg_a_safe_fp_math
-; SI: v_cvt_f32_f16_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; SI: v_subrev_f32_e32 v[[NEG_A0:[0-9]+]], v{{[0-9]+}}, v[[ZERO]]
-; SI: v_subrev_f32_e32 v[[NEG_A1:[0-9]+]], v{{[0-9]+}}, v[[ZERO]]
-; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
-; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
+; GCN-LABEL: {{^}}mac_v2f16_neg_a_safe_fp_math:
+
+; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
+; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
+
; VI: v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
-; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
-; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
+; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
+
; GCN: s_endpgm
-define void @mac_v2f16_neg_a_safe_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_a_safe_fp_math(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b,
@@ -468,18 +506,20 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math
-; SI: v_cvt_f32_f16_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; SI: v_subrev_f32_e32 v[[NEG_A0:[0-9]+]], v{{[0-9]+}}, v[[ZERO]]
-; SI: v_subrev_f32_e32 v[[NEG_A1:[0-9]+]], v{{[0-9]+}}, v[[ZERO]]
-; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
-; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math:
+
+; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
+; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
+
; VI: v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
-; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
-; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
+; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
+
; GCN: s_endpgm
-define void @mac_v2f16_neg_b_safe_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_b_safe_fp_math(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b,
@@ -497,18 +537,20 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math
-; SI: v_cvt_f32_f16_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; SI: v_subrev_f32_e32 v[[NEG_A0:[0-9]+]], v{{[0-9]+}}, v[[ZERO]]
-; SI: v_subrev_f32_e32 v[[NEG_A1:[0-9]+]], v{{[0-9]+}}, v[[ZERO]]
-; SI: v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}}
-; SI: v_mac_f32_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math:
+
+; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
+; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
+; SI-DAG: v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}}
+; SI-DAG: v_mac_f32_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
+
; VI: v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
-; VI: v_mac_f16_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
+; VI-DAG: v_mac_f16_sdwa v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
+
; GCN: s_endpgm
-define void @mac_v2f16_neg_c_safe_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_c_safe_fp_math(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b,
@@ -526,15 +568,22 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}mac_v2f16_neg_a_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_a_nsz_fp_math:
+; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
+
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
; GCN: s_endpgm
-define void @mac_v2f16_neg_a_unsafe_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_a_nsz_fp_math(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b,
@@ -552,15 +601,22 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}mac_v2f16_neg_b_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_b_nsz_fp_math:
+; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
+
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
; GCN: s_endpgm
-define void @mac_v2f16_neg_b_unsafe_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_b_nsz_fp_math(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b,
@@ -578,15 +634,22 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}mac_v2f16_neg_c_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
-; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_c_nsz_fp_math:
+; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
+
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+
; VI-NOT: v_mac_f16
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
; GCN: s_endpgm
-define void @mac_v2f16_neg_c_unsafe_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_c_nsz_fp_math(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b,
@@ -604,5 +667,5 @@ entry:
ret void
}
-attributes #0 = {"unsafe-fp-math"="false"}
-attributes #1 = {"unsafe-fp-math"="true"}
+attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" }
+attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/v_madak_f16.ll b/test/CodeGen/AMDGPU/v_madak_f16.ll
index df220d7a977b..bfb10503aaea 100644
--- a/test/CodeGen/AMDGPU/v_madak_f16.ll
+++ b/test/CodeGen/AMDGPU/v_madak_f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}madak_f16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -7,7 +7,7 @@
; VI: v_madak_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], 0x4900{{$}}
; VI: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
-define void @madak_f16(
+define amdgpu_kernel void @madak_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@@ -28,7 +28,7 @@ entry:
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_endpgm
-define void @madak_f16_use_2(
+define amdgpu_kernel void @madak_f16_use_2(
half addrspace(1)* %r0,
half addrspace(1)* %r1,
half addrspace(1)* %a,
diff --git a/test/CodeGen/AMDGPU/valu-i1.ll b/test/CodeGen/AMDGPU/valu-i1.ll
index e64f8467240a..85a8929ebe58 100644
--- a/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/test/CodeGen/AMDGPU/valu-i1.ll
@@ -29,7 +29,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
; SI-NEXT: ; mask branch
;
-define void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
+define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
switch i32 %tid, label %default [
@@ -64,29 +64,100 @@ end:
ret void
}
-; SI-LABEL: @simple_test_v_if
+; SI-LABEL: {{^}}simple_test_v_if:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
+; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
-; SI: BB{{[0-9]+_[0-9]+}}:
+; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
; SI: buffer_store_dword
-; SI: s_endpgm
+; SI-NEXT: s_waitcnt
-; SI: BB1_2:
+; SI-NEXT: {{^}}[[EXIT]]:
; SI: s_or_b64 exec, exec, [[BR_SREG]]
; SI: s_endpgm
-define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%is.0 = icmp ne i32 %tid, 0
- br i1 %is.0, label %store, label %exit
+ br i1 %is.0, label %then, label %exit
+
+then:
+ %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
+ store i32 999, i32 addrspace(1)* %gep
+ br label %exit
+
+exit:
+ ret void
+}
+
+; FIXME: It would be better to endpgm in the then block.
+
+; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret:
+; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
+; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
+; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
+; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
+
+; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
+; SI: buffer_store_dword
+; SI-NEXT: s_waitcnt
+
+; SI-NEXT: {{^}}[[EXIT]]:
+; SI: s_or_b64 exec, exec, [[BR_SREG]]
+; SI: s_endpgm
+define amdgpu_kernel void @simple_test_v_if_ret_else_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %is.0 = icmp ne i32 %tid, 0
+ br i1 %is.0, label %then, label %exit
+
+then:
+ %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
+ store i32 999, i32 addrspace(1)* %gep
+ ret void
+
+exit:
+ ret void
+}
+
+; The final block has more than a ret to execute. This was miscompiled
+; before function exit blocks were unified, since an endpgm in the then
+; block would terminate the wavefront before the store in the final
+; block was reached.
+
+; SI-LABEL: {{^}}simple_test_v_if_ret_else_code_ret:
+; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
+; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
+; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
+; SI: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
+
+; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %exit
+; SI: ds_write_b32
+; SI: s_waitcnt
+
+; SI-NEXT: {{^}}[[FLOW]]:
+; SI-NEXT: s_or_saveexec_b64
+; SI-NEXT: s_xor_b64 exec, exec
+; SI-NEXT: ; mask branch [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]]
+
+; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %then
+; SI: buffer_store_dword
+; SI-NEXT: s_waitcnt
+
+; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock
+; SI: s_or_b64 exec, exec
+; SI: s_endpgm
+define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %is.0 = icmp ne i32 %tid, 0
+ br i1 %is.0, label %then, label %exit
-store:
+then:
%gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
store i32 999, i32 addrspace(1)* %gep
ret void
exit:
+ store volatile i32 7, i32 addrspace(3)* undef
ret void
}
@@ -101,12 +172,12 @@ exit:
; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
; SI: buffer_load_dword
; SI-DAG: buffer_store_dword
-; SI-DAG: s_cmpk_eq_i32 s{{[0-9]+}}, 0x100
-; SI: s_cbranch_scc0 [[LABEL_LOOP]]
+; SI-DAG: v_cmp_eq_u32_e32 vcc, 0x100
+; SI: s_cbranch_vccz [[LABEL_LOOP]]
; SI: [[LABEL_EXIT]]:
; SI: s_endpgm
-define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+define amdgpu_kernel void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%is.0 = icmp ne i32 %tid, 0
@@ -156,7 +227,7 @@ exit:
; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20
; SI: buffer_store_dword
-; SI: v_cmp_ge_i64_e32 [[CMP:s\[[0-9]+:[0-9]+\]|vcc]]
+; SI: v_cmp_ge_i64_e{{32|64}} [[CMP:s\[[0-9]+:[0-9]+\]|vcc]]
; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]]
; SI: [[LABEL_FLOW]]:
@@ -173,7 +244,7 @@ exit:
; SI-NOT: [[COND_STATE]]
; SI: s_endpgm
-define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
+define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tmp4 = sext i32 %tmp to i64
diff --git a/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
index 03e473e3a0c0..5e5465800c3a 100644
--- a/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
+++ b/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
@@ -1,7 +1,7 @@
# RUN: llc -run-pass si-insert-waits -march=amdgcn -mcpu=tahiti -o - %s | FileCheck %s
--- |
- define void @vccz_corrupt_workaround(float %cond, i32 addrspace(1)* %out) #0 {
+ define amdgpu_kernel void @vccz_corrupt_workaround(float %cond, i32 addrspace(1)* %out) #0 {
entry:
%cmp0 = fcmp oeq float %cond, 0.000000e+00
br i1 %cmp0, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
@@ -20,7 +20,7 @@
ret void
}
- define void @vccz_corrupt_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 {
+ define amdgpu_kernel void @vccz_corrupt_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 {
entry:
br i1 undef, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
diff --git a/test/CodeGen/AMDGPU/vector-alloca.ll b/test/CodeGen/AMDGPU/vector-alloca.ll
index 7dcf36f144ac..03cf725601b7 100644
--- a/test/CodeGen/AMDGPU/vector-alloca.ll
+++ b/test/CodeGen/AMDGPU/vector-alloca.ll
@@ -15,7 +15,7 @@
; EG: MOV
; EG: MOV
; EG: MOVA_INT
-define void @vector_read(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @vector_read(i32 addrspace(1)* %out, i32 %index) {
entry:
%tmp = alloca [4 x i32]
%x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
@@ -44,7 +44,7 @@ entry:
; EG: MOV
; EG: MOVA_INT
; EG: MOVA_INT
-define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+define amdgpu_kernel void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
entry:
%tmp = alloca [4 x i32]
%x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
@@ -71,7 +71,7 @@ entry:
; FUNC-LABEL: {{^}}bitcast_gep:
; EG: STORE_RAW
-define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+define amdgpu_kernel void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
entry:
%tmp = alloca [4 x i32]
%x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
@@ -93,7 +93,7 @@ entry:
; OPT-LABEL: @vector_read_bitcast_gep(
; OPT: %0 = extractelement <4 x i32> <i32 1065353216, i32 1, i32 2, i32 3>, i32 %index
; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
-define void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) {
entry:
%tmp = alloca [4 x i32]
%x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
@@ -121,7 +121,7 @@ entry:
; OPT: store float
; OPT: store float
; OPT: load float
-define void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
entry:
%tmp = alloca [4 x i32]
%tmp.bc = bitcast [4 x i32]* %tmp to [4 x float]*
diff --git a/test/CodeGen/AMDGPU/vector-extract-insert.ll b/test/CodeGen/AMDGPU/vector-extract-insert.ll
index 2d39f82e2499..ab2bfcfd1fb7 100644
--- a/test/CodeGen/AMDGPU/vector-extract-insert.ll
+++ b/test/CodeGen/AMDGPU/vector-extract-insert.ll
@@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOT: [[VVAL]]
; GCN: buffer_store_dword [[VVAL]]
-define void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
+define amdgpu_kernel void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%id.ext = sext i32 %id to i64
%gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext
@@ -30,7 +30,7 @@ define void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32>
; GCN: v_movreld_b32
; GCN: v_movrels_b32
; GCN: buffer_store_dword v
-define void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx0, i32 %idx1) #1 {
+define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx0, i32 %idx1) #1 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%id.ext = sext i32 %id to i64
%gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext
@@ -49,7 +49,7 @@ define void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOT: [[VVAL]]
; GCN: buffer_store_dword [[VVAL]]
-define void @extract_insert_same_elt2_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
+define amdgpu_kernel void @extract_insert_same_elt2_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%id.ext = sext i32 %id to i64
%gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext
@@ -68,7 +68,7 @@ define void @extract_insert_same_elt2_v4i32(i32 addrspace(1)* %out, <4 x i32> ad
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOT: [[VVAL]]
; GCN: buffer_store_dword [[VVAL]]
-define void @extract_insert_same_dynelt_v4f32(float addrspace(1)* %out, <4 x float> addrspace(1)* %in, float %val, i32 %idx) #1 {
+define amdgpu_kernel void @extract_insert_same_dynelt_v4f32(float addrspace(1)* %out, <4 x float> addrspace(1)* %in, float %val, i32 %idx) #1 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%id.ext = sext i32 %id to i64
%gep.in = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %id.ext
diff --git a/test/CodeGen/AMDGPU/vectorize-global-local.ll b/test/CodeGen/AMDGPU/vectorize-global-local.ll
new file mode 100644
index 000000000000..90cf34e609f6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vectorize-global-local.ll
@@ -0,0 +1,80 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; CHECK-DAG: flat_load_dwordx4
+; CHECK-DAG: flat_load_dwordx4
+; CHECK-DAG: flat_load_dwordx4
+; CHECK-DAG: flat_load_dwordx4
+; CHECK-DAG: ds_write2_b32
+; CHECK-DAG: ds_write2_b32
+; CHECK-DAG: ds_write2_b32
+; CHECK-DAG: ds_write2_b32
+; CHECK-DAG: ds_write2_b32
+; CHECK-DAG: ds_write2_b32
+; CHECK-DAG: ds_write2_b32
+; CHECK-DAG: ds_write2_b32
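+; The 16 contiguous i32 loads from global memory should be combined into
+; four flat_load_dwordx4, and the consecutive LDS stores merged pairwise
+; into ds_write2_b32.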
+
+define amdgpu_kernel void @vectorize_global_local(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(3)* nocapture %arg1) {
+bb:
+ %tmp = load i32, i32 addrspace(1)* %arg, align 4
+ store i32 %tmp, i32 addrspace(3)* %arg1, align 4
+ %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+ %tmp3 = load i32, i32 addrspace(1)* %tmp2, align 4
+ %tmp4 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 1
+ store i32 %tmp3, i32 addrspace(3)* %tmp4, align 4
+ %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+ %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
+ %tmp7 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 2
+ store i32 %tmp6, i32 addrspace(3)* %tmp7, align 4
+ %tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+ %tmp9 = load i32, i32 addrspace(1)* %tmp8, align 4
+ %tmp10 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 3
+ store i32 %tmp9, i32 addrspace(3)* %tmp10, align 4
+ %tmp11 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 4
+ %tmp12 = load i32, i32 addrspace(1)* %tmp11, align 4
+ %tmp13 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 4
+ store i32 %tmp12, i32 addrspace(3)* %tmp13, align 4
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 5
+ %tmp15 = load i32, i32 addrspace(1)* %tmp14, align 4
+ %tmp16 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 5
+ store i32 %tmp15, i32 addrspace(3)* %tmp16, align 4
+ %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 6
+ %tmp18 = load i32, i32 addrspace(1)* %tmp17, align 4
+ %tmp19 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 6
+ store i32 %tmp18, i32 addrspace(3)* %tmp19, align 4
+ %tmp20 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 7
+ %tmp21 = load i32, i32 addrspace(1)* %tmp20, align 4
+ %tmp22 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 7
+ store i32 %tmp21, i32 addrspace(3)* %tmp22, align 4
+ %tmp23 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 8
+ %tmp24 = load i32, i32 addrspace(1)* %tmp23, align 4
+ %tmp25 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 8
+ store i32 %tmp24, i32 addrspace(3)* %tmp25, align 4
+ %tmp26 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 9
+ %tmp27 = load i32, i32 addrspace(1)* %tmp26, align 4
+ %tmp28 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 9
+ store i32 %tmp27, i32 addrspace(3)* %tmp28, align 4
+ %tmp29 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 10
+ %tmp30 = load i32, i32 addrspace(1)* %tmp29, align 4
+ %tmp31 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 10
+ store i32 %tmp30, i32 addrspace(3)* %tmp31, align 4
+ %tmp32 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 11
+ %tmp33 = load i32, i32 addrspace(1)* %tmp32, align 4
+ %tmp34 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 11
+ store i32 %tmp33, i32 addrspace(3)* %tmp34, align 4
+ %tmp35 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 12
+ %tmp36 = load i32, i32 addrspace(1)* %tmp35, align 4
+ %tmp37 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 12
+ store i32 %tmp36, i32 addrspace(3)* %tmp37, align 4
+ %tmp38 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 13
+ %tmp39 = load i32, i32 addrspace(1)* %tmp38, align 4
+ %tmp40 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 13
+ store i32 %tmp39, i32 addrspace(3)* %tmp40, align 4
+ %tmp41 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 14
+ %tmp42 = load i32, i32 addrspace(1)* %tmp41, align 4
+ %tmp43 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 14
+ store i32 %tmp42, i32 addrspace(3)* %tmp43, align 4
+ %tmp44 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 15
+ %tmp45 = load i32, i32 addrspace(1)* %tmp44, align 4
+ %tmp46 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 15
+ store i32 %tmp45, i32 addrspace(3)* %tmp46, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll b/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll
index 3d71062f1fba..46a1c87184d1 100644
--- a/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll
+++ b/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll
@@ -6,7 +6,7 @@
; EG: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00
; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00
-define void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%v = load i32, i32 addrspace(1)* %in
store i32 %v, i32 addrspace(1)* %out
ret void
@@ -16,7 +16,7 @@ define void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; EG: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[SRC]],0x40,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x08,0x00
; CM: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[SRC]],0x00,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x00,0x00
-define void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%v = load <4 x i32>, <4 x i32> addrspace(1)* %in
store <4 x i32> %v, <4 x i32> addrspace(1)* %out
ret void
@@ -26,7 +26,7 @@ define void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)*
; EG: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #3 ; encoding: [0x40,0x03,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00
; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #3 ; encoding: [0x40,0x03,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00
-define void @vtx_fetch32_id3(i32 addrspace(1)* %out, i32 addrspace(7)* %in) {
+define amdgpu_kernel void @vtx_fetch32_id3(i32 addrspace(1)* %out, i32 addrspace(7)* %in) {
%v = load i32, i32 addrspace(7)* %in
store i32 %v, i32 addrspace(1)* %out
ret void
@@ -38,7 +38,7 @@ define void @vtx_fetch32_id3(i32 addrspace(1)* %out, i32 addrspace(7)* %in) {
@t = internal addrspace(2) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3]
-define void @vtx_fetch32_id2(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @vtx_fetch32_id2(i32 addrspace(1)* %out, i32 %in) {
%a = getelementptr inbounds [4 x i32], [4 x i32] addrspace(2)* @t, i32 0, i32 %in
%v = load i32, i32 addrspace(2)* %a
store i32 %v, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
index a8908f87fbf6..e82e548f23cd 100644
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -1,5 +1,6 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=SIMESA %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+vgpr-spilling,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=GFX9MESA %s
; RUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s
; RUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s
@@ -15,16 +16,17 @@
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_flat_scratch_init = 0
-; HSA: workitem_private_segment_byte_size = 1024
+; HSA: workitem_private_segment_byte_size = 1536
; GCN-NOT: flat_scr
; GCNMESA-DAG: s_mov_b32 s16, s3
; GCNMESA-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCNMESA--DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCNMESA-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCNMESA-DAG: s_mov_b32 s14, -1
; SIMESA-DAG: s_mov_b32 s15, 0xe8f000
; VIMESA-DAG: s_mov_b32 s15, 0xe80000
+; GFX9MESA-DAG: s_mov_b32 s15, 0xe00000
; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill
@@ -40,10 +42,10 @@
; GCN: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}}
; GCN: NumVgprs: 256
-; GCN: ScratchSize: 1024
+; GCN: ScratchSize: 1536
; s[0:3] input user SGPRs. s4,s5,s6 = workgroup IDs. s8 scratch offset.
-define void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 {
+define amdgpu_kernel void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 {
bb:
%tmp = add i32 %arg1, %arg2
%tmp7 = extractelement <4 x float> %arg6, i32 0
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index 4de35b97aeab..c9c8583d5e87 100644
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -1,5 +1,6 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; This ends up using all 255 registers and requires register
; scavenging which will fail to find an unused register.
@@ -12,19 +13,19 @@
; GCN-LABEL: {{^}}main:
-; GCN-DAG: s_mov_b32 s11, s12
-; GCN-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCN-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN-DAG: s_mov_b32 s14, -1
-; SI-DAG: s_mov_b32 s15, 0xe8f000
-; VI-DAG: s_mov_b32 s15, 0xe80000
-
-; s11 is offset system SGPR
-; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Reload
+; GCN-DAG: s_mov_b32 s[[OFFREG:[0-9]+]], s12
+; GCN-DAG: s_mov_b32 s[[DESC0:[0-9]+]], SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; GCN-DAG: s_mov_b32 s{{[0-9]+}}, -1
+; SI-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe8f000
+; VI-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe80000
+; GFX9-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe00000
+; OFFREG is offset system SGPR
+; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s[[OFFREG]] offset:{{[0-9]+}} ; 4-byte Folded Spill
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s[[OFFREG]] offset:{{[0-9]+}} ; 4-byte Folded Reload
; GCN: NumVgprs: 256
-; GCN: ScratchSize: 1024
+; GCN: ScratchSize: 1536
define amdgpu_vs void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
bb:
@@ -36,7 +37,8 @@ bb:
%tmp15 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %arg4, i64 0, i64 0
%tmp16 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp15, align 16, !tbaa !0
%tmp17 = add i32 %arg5, %arg7
- %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp16, i32 0, i32 %tmp17)
+ %tmp16.cast = bitcast <16 x i8> %tmp16 to <4 x i32>
+ %tmp18 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp16.cast, i32 %tmp17, i32 0, i1 false, i1 false)
%tmp19 = extractelement <4 x float> %tmp18, i32 0
%tmp20 = extractelement <4 x float> %tmp18, i32 1
%tmp21 = extractelement <4 x float> %tmp18, i32 2
@@ -180,39 +182,39 @@ bb24: ; preds = %bb157, %bb
br i1 %tmp155, label %bb156, label %bb157
bb156: ; preds = %bb24
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp12, float %tmp103, float %tmp102, float %tmp101)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 33, i32 0, float %tmp99, float %tmp98, float %tmp97, float %tmp95)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 34, i32 0, float %tmp94, float %tmp93, float %tmp91, float %tmp90)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 35, i32 0, float %tmp89, float %tmp87, float %tmp86, float %tmp85)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 36, i32 0, float %tmp83, float %tmp82, float %tmp81, float %tmp79)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 37, i32 0, float %tmp78, float %tmp77, float %tmp75, float %tmp74)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 38, i32 0, float %tmp73, float %tmp71, float %tmp70, float %tmp69)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 39, i32 0, float %tmp67, float %tmp66, float %tmp65, float %tmp63)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 40, i32 0, float %tmp62, float %tmp61, float %tmp59, float %tmp58)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 41, i32 0, float %tmp57, float %tmp55, float %tmp54, float %tmp53)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 42, i32 0, float %tmp51, float %tmp50, float %tmp49, float %tmp47)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 43, i32 0, float %tmp46, float %tmp45, float %tmp43, float %tmp42)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 44, i32 0, float %tmp41, float %tmp39, float %tmp38, float %tmp37)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 45, i32 0, float %tmp35, float %tmp34, float %tmp33, float %tmp31)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 46, i32 0, float %tmp30, float %tmp29, float %tmp27, float %tmp26)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 47, i32 0, float %tmp25, float %tmp28, float %tmp32, float %tmp36)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 48, i32 0, float %tmp40, float %tmp44, float %tmp48, float %tmp52)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 49, i32 0, float %tmp56, float %tmp60, float %tmp64, float %tmp68)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 50, i32 0, float %tmp72, float %tmp76, float %tmp80, float %tmp84)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 51, i32 0, float %tmp88, float %tmp92, float %tmp96, float %tmp100)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 52, i32 0, float %tmp104, float %tmp105, float %tmp106, float %tmp108)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 53, i32 0, float %tmp109, float %tmp110, float %tmp111, float %tmp112)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 54, i32 0, float %tmp113, float %tmp114, float %tmp115, float %tmp116)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 55, i32 0, float %tmp117, float %tmp118, float %tmp119, float %tmp120)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 56, i32 0, float %tmp121, float %tmp122, float %tmp123, float %tmp124)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 57, i32 0, float %tmp125, float %tmp126, float %tmp127, float %tmp128)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 58, i32 0, float %tmp129, float %tmp130, float %tmp131, float %tmp132)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 59, i32 0, float %tmp133, float %tmp134, float %tmp135, float %tmp136)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 60, i32 0, float %tmp137, float %tmp138, float %tmp139, float %tmp140)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 61, i32 0, float %tmp141, float %tmp142, float %tmp143, float %tmp144)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 62, i32 0, float %tmp145, float %tmp146, float %tmp147, float %tmp148)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 63, i32 0, float %tmp149, float %tmp150, float %tmp151, float %tmp13)
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22)
+ call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp12, float %tmp103, float %tmp102, float %tmp101, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float %tmp99, float %tmp98, float %tmp97, float %tmp95, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 34, i32 15, float %tmp94, float %tmp93, float %tmp91, float %tmp90, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 35, i32 15, float %tmp89, float %tmp87, float %tmp86, float %tmp85, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 36, i32 15, float %tmp83, float %tmp82, float %tmp81, float %tmp79, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 37, i32 15, float %tmp78, float %tmp77, float %tmp75, float %tmp74, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 38, i32 15, float %tmp73, float %tmp71, float %tmp70, float %tmp69, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 39, i32 15, float %tmp67, float %tmp66, float %tmp65, float %tmp63, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 40, i32 15, float %tmp62, float %tmp61, float %tmp59, float %tmp58, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 41, i32 15, float %tmp57, float %tmp55, float %tmp54, float %tmp53, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 42, i32 15, float %tmp51, float %tmp50, float %tmp49, float %tmp47, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 43, i32 15, float %tmp46, float %tmp45, float %tmp43, float %tmp42, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 44, i32 15, float %tmp41, float %tmp39, float %tmp38, float %tmp37, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 45, i32 15, float %tmp35, float %tmp34, float %tmp33, float %tmp31, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 46, i32 15, float %tmp30, float %tmp29, float %tmp27, float %tmp26, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 47, i32 15, float %tmp25, float %tmp28, float %tmp32, float %tmp36, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 48, i32 15, float %tmp40, float %tmp44, float %tmp48, float %tmp52, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 49, i32 15, float %tmp56, float %tmp60, float %tmp64, float %tmp68, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 50, i32 15, float %tmp72, float %tmp76, float %tmp80, float %tmp84, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 51, i32 15, float %tmp88, float %tmp92, float %tmp96, float %tmp100, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 52, i32 15, float %tmp104, float %tmp105, float %tmp106, float %tmp108, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 53, i32 15, float %tmp109, float %tmp110, float %tmp111, float %tmp112, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 54, i32 15, float %tmp113, float %tmp114, float %tmp115, float %tmp116, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 55, i32 15, float %tmp117, float %tmp118, float %tmp119, float %tmp120, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 56, i32 15, float %tmp121, float %tmp122, float %tmp123, float %tmp124, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 57, i32 15, float %tmp125, float %tmp126, float %tmp127, float %tmp128, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 58, i32 15, float %tmp129, float %tmp130, float %tmp131, float %tmp132, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 59, i32 15, float %tmp133, float %tmp134, float %tmp135, float %tmp136, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 60, i32 15, float %tmp137, float %tmp138, float %tmp139, float %tmp140, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 61, i32 15, float %tmp141, float %tmp142, float %tmp143, float %tmp144, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 62, i32 15, float %tmp145, float %tmp146, float %tmp147, float %tmp148, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 63, i32 15, float %tmp149, float %tmp150, float %tmp151, float %tmp13, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %tmp19, float %tmp20, float %tmp21, float %tmp22, i1 true, i1 false) #0
ret void
bb157: ; preds = %bb24
@@ -483,18 +485,15 @@ bb157: ; preds = %bb24
br label %bb24
}
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #2
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
!0 = !{!1, !1, i64 0, i32 1}
!1 = !{!"const", !2}
diff --git a/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll b/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
index ad7521a3da9b..8d66c346ed5b 100644
--- a/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
+++ b/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
@@ -1,10 +1,10 @@
; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-; ERROR: error: :1:42: in function rsq_legacy_f32 void (float addrspace(1)*, float): intrinsic not supported on subtarget
+; ERROR: error: foo.cl:1:42: in function rsq_legacy_f32 void (float addrspace(1)*, float): intrinsic not supported on subtarget
declare float @llvm.amdgcn.rsq.legacy(float) #0
-define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
%rsq = call float @llvm.amdgcn.rsq.legacy(float %src), !dbg !4
store float %rsq, float addrspace(1)* %out, align 4
ret void
@@ -21,4 +21,4 @@ attributes #1 = { nounwind }
!2 = !{i32 2, !"Dwarf Version", i32 4}
!3 = !{i32 2, !"Debug Info Version", i32 3}
!4 = !DILocation(line: 1, column: 42, scope: !5)
-!5 = distinct !DISubprogram(name: "rsq_legacy_f32", scope: null, line: 1, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !0)
+!5 = distinct !DISubprogram(name: "rsq_legacy_f32", scope: null, file: !1, line: 1, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !0)
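+; Giving the subprogram a file (presumably !1 names foo.cl) is what puts the
+; "foo.cl" prefix on the diagnostic checked above, in place of the bare
+; ":1:42" location.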
diff --git a/test/CodeGen/AMDGPU/vop-shrink.ll b/test/CodeGen/AMDGPU/vop-shrink.ll
index ae8ec58270c1..d2708b068eb4 100644
--- a/test/CodeGen/AMDGPU/vop-shrink.ll
+++ b/test/CodeGen/AMDGPU/vop-shrink.ll
@@ -8,7 +8,7 @@
; ModuleID = 'vop-shrink.ll'
-define void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) {
+define amdgpu_kernel void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) {
entry:
%vgpr = call i32 @llvm.amdgcn.workitem.id.x() #1
%tmp = icmp eq i32 %cond, 0
@@ -35,7 +35,7 @@ endif: ; preds = %else, %if
; FUNC-LABEL: {{^}}add_fold:
; SI: v_add_f32_e32 v{{[0-9]+}}, 0x44800000
-define void @add_fold(float addrspace(1)* %out) {
+define amdgpu_kernel void @add_fold(float addrspace(1)* %out) {
entry:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = uitofp i32 %tmp to float
diff --git a/test/CodeGen/AMDGPU/vselect.ll b/test/CodeGen/AMDGPU/vselect.ll
index fe5be7526b19..bb6234729f90 100644
--- a/test/CodeGen/AMDGPU/vselect.ll
+++ b/test/CodeGen/AMDGPU/vselect.ll
@@ -10,7 +10,7 @@
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e32
-define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) {
+define amdgpu_kernel void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) {
entry:
%load0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0
%load1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1
@@ -28,7 +28,7 @@ entry:
;SI: v_cndmask_b32_e64
;SI: v_cndmask_b32_e32
-define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
+define amdgpu_kernel void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
entry:
%0 = load <2 x float>, <2 x float> addrspace(1)* %in0
%1 = load <2 x float>, <2 x float> addrspace(1)* %in1
@@ -52,7 +52,7 @@ entry:
; SI: v_cndmask_b32
; SI: v_cndmask_b32
-define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) {
+define amdgpu_kernel void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) {
entry:
%load0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0
%load1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1
@@ -68,7 +68,7 @@ entry:
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
+define amdgpu_kernel void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
entry:
%0 = load <4 x float>, <4 x float> addrspace(1)* %in0
%1 = load <4 x float>, <4 x float> addrspace(1)* %in1
diff --git a/test/CodeGen/AMDGPU/vselect64.ll b/test/CodeGen/AMDGPU/vselect64.ll
index ef85ebe7899f..4a0435565161 100644
--- a/test/CodeGen/AMDGPU/vselect64.ll
+++ b/test/CodeGen/AMDGPU/vselect64.ll
@@ -5,7 +5,7 @@
; Make sure the vectors aren't being stored on the stack. We know they are
; being stored on the stack if the shader uses at least 10 registers.
; CHECK-NOT: {{\**}} MOV T{{[0-9][0-9]}}.X
-define void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) {
+define amdgpu_kernel void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) {
entry:
%cmp = icmp ne <4 x i32> %c, <i32 0, i32 0, i32 0, i32 0>
%result = select <4 x i1> %cmp, <4 x i64> <i64 0, i64 1, i64 2, i64 3>, <4 x i64> <i64 4, i64 5, i64 6, i64 7>
diff --git a/test/CodeGen/AMDGPU/vtx-fetch-branch.ll b/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
index 4584d6e25254..4c5eb3d3aa5d 100644
--- a/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
+++ b/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
@@ -10,7 +10,7 @@
; CHECK-NOT: ALU_POP_AFTER
; CHECK: TEX
; CHECK-NEXT: POP
-define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
entry:
%0 = icmp eq i32 %cond, 0
br i1 %0, label %endif, label %if
diff --git a/test/CodeGen/AMDGPU/vtx-schedule.ll b/test/CodeGen/AMDGPU/vtx-schedule.ll
index 912e258ebb83..c4b619bf168f 100644
--- a/test/CodeGen/AMDGPU/vtx-schedule.ll
+++ b/test/CodeGen/AMDGPU/vtx-schedule.ll
@@ -9,7 +9,7 @@
; CHECK: VTX_READ_32 [[IN0:T[0-9]+\.X]], [[IN0]], 0
; CHECK: Fetch clause
; CHECK: VTX_READ_32 [[IN1:T[0-9]+\.X]], [[IN1]], 0
-define void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) {
+define amdgpu_kernel void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) {
entry:
%0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in0
%1 = load i32, i32 addrspace(1)* %0
diff --git a/test/CodeGen/AMDGPU/wait.ll b/test/CodeGen/AMDGPU/wait.ll
index 621c582fcefd..623cbeae8da9 100644
--- a/test/CodeGen/AMDGPU/wait.ll
+++ b/test/CodeGen/AMDGPU/wait.ll
@@ -11,26 +11,27 @@
; DEFAULT: exp
; DEFAULT: s_waitcnt lgkmcnt(0)
; DEFAULT: s_endpgm
-define amdgpu_vs void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) {
+define amdgpu_vs void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 {
main_body:
%tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0
%tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
- %tmp11 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp10, i32 0, i32 %arg6)
+ %tmp10.cast = bitcast <16 x i8> %tmp10 to <4 x i32>
+ %tmp11 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp10.cast, i32 %arg6, i32 0, i1 false, i1 false)
%tmp12 = extractelement <4 x float> %tmp11, i32 0
%tmp13 = extractelement <4 x float> %tmp11, i32 1
call void @llvm.amdgcn.s.barrier() #1
%tmp14 = extractelement <4 x float> %tmp11, i32 2
-; %tmp15 = extractelement <4 x float> %tmp11, i32 3
- %tmp15 = load float, float addrspace(2)* %constptr, align 4 ; Force waiting for expcnt and lgkmcnt
+ %tmp15 = load float, float addrspace(2)* %constptr, align 4
%tmp16 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 1
%tmp17 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp16, !tbaa !0
- %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp17, i32 0, i32 %arg6)
+ %tmp17.cast = bitcast <16 x i8> %tmp17 to <4 x i32>
+ %tmp18 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp17.cast, i32 %arg6, i32 0, i1 false, i1 false)
%tmp19 = extractelement <4 x float> %tmp18, i32 0
%tmp20 = extractelement <4 x float> %tmp18, i32 1
%tmp21 = extractelement <4 x float> %tmp18, i32 2
%tmp22 = extractelement <4 x float> %tmp18, i32 3
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22)
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp12, float %tmp13, float %tmp14, float %tmp15)
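+  ; The old llvm.SI.export operand order (en, vm, done, tgt, compr, r, g, b, a)
+  ; maps to llvm.amdgcn.exp.f32 as (tgt, en, r, g, b, a, done, vm); tgt 12 is
+  ; pos0 and tgt 32 is param0, with done set only on the final export.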
+ call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp19, float %tmp20, float %tmp21, float %tmp22, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %tmp12, float %tmp13, float %tmp14, float %tmp15, i1 true, i1 false) #0
ret void
}
@@ -41,45 +42,42 @@ main_body:
; ILPMAX: s_load_dwordx4
; ILPMAX: s_waitcnt lgkmcnt(0)
; ILPMAX: buffer_load
-; ILPMAX: s_waitcnt vmcnt(1)
; ILPMAX: s_waitcnt vmcnt(0)
+; ILPMAX: exp pos0
+; ILPMAX-NEXT: exp param0
; ILPMAX: s_endpgm
-
-define amdgpu_vs void @main2([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)*
-byval, i32 inreg, i32 inreg, i32, i32, i32, i32) {
+define amdgpu_vs void @main2([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
main_body:
- %11 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 0
- %12 = load <16 x i8>, <16 x i8> addrspace(2)* %11, align 16, !tbaa !0
- %13 = add i32 %5, %7
- %14 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %12, i32 0, i32 %13)
- %15 = extractelement <4 x float> %14, i32 0
- %16 = extractelement <4 x float> %14, i32 1
- %17 = extractelement <4 x float> %14, i32 2
- %18 = extractelement <4 x float> %14, i32 3
- %19 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 1
- %20 = load <16 x i8>, <16 x i8> addrspace(2)* %19, align 16, !tbaa !0
- %21 = add i32 %5, %7
- %22 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %20, i32 0, i32 %21)
- %23 = extractelement <4 x float> %22, i32 0
- %24 = extractelement <4 x float> %22, i32 1
- %25 = extractelement <4 x float> %22, i32 2
- %26 = extractelement <4 x float> %22, i32 3
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %15, float %16, float %17, float %18)
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %23, float %24, float %25, float %26)
+ %tmp = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %arg4, i64 0, i64 0
+ %tmp11 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, align 16, !tbaa !0
+ %tmp12 = add i32 %arg5, %arg7
+ %tmp11.cast = bitcast <16 x i8> %tmp11 to <4 x i32>
+ %tmp13 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp11.cast, i32 %tmp12, i32 0, i1 false, i1 false)
+ %tmp14 = extractelement <4 x float> %tmp13, i32 0
+ %tmp15 = extractelement <4 x float> %tmp13, i32 1
+ %tmp16 = extractelement <4 x float> %tmp13, i32 2
+ %tmp17 = extractelement <4 x float> %tmp13, i32 3
+ %tmp18 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %arg4, i64 0, i64 1
+ %tmp19 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp18, align 16, !tbaa !0
+ %tmp20 = add i32 %arg5, %arg7
+ %tmp19.cast = bitcast <16 x i8> %tmp19 to <4 x i32>
+ %tmp21 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp19.cast, i32 %tmp20, i32 0, i1 false, i1 false)
+ %tmp22 = extractelement <4 x float> %tmp21, i32 0
+ %tmp23 = extractelement <4 x float> %tmp21, i32 1
+ %tmp24 = extractelement <4 x float> %tmp21, i32 2
+ %tmp25 = extractelement <4 x float> %tmp21, i32 3
+ call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %tmp14, float %tmp15, float %tmp16, float %tmp17, i1 false, i1 false) #0
+ call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp22, float %tmp23, float %tmp24, float %tmp25, i1 true, i1 false) #0
ret void
}
-
-; Function Attrs: convergent nounwind
declare void @llvm.amdgcn.s.barrier() #1
+declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #2
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
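+; llvm.amdgcn.buffer.load.format takes its resource as <4 x i32>, hence the
+; bitcasts from the <16 x i8> descriptors loaded above; the trailing i1 pair
+; is the glc/slc modifiers.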
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
+attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
-attributes #2 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
!0 = !{!1, !1, i64 0, i32 1}
!1 = !{!"const", !2}
diff --git a/test/CodeGen/AMDGPU/waitcnt-flat.ll b/test/CodeGen/AMDGPU/waitcnt-flat.ll
index d29bae45d8c2..5d86b12da95f 100644
--- a/test/CodeGen/AMDGPU/waitcnt-flat.ll
+++ b/test/CodeGen/AMDGPU/waitcnt-flat.ll
@@ -9,7 +9,7 @@
; XGCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]]
; XGCN: s_waitcnt vmcnt(0) lgkmcnt(0)
; XGCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}]
-define void @test(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) {
store volatile i32 0, i32 addrspace(1)* %out
%val = load volatile i32, i32 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/waitcnt.mir b/test/CodeGen/AMDGPU/waitcnt.mir
index cb5de6a2419d..38662e83b359 100644
--- a/test/CodeGen/AMDGPU/waitcnt.mir
+++ b/test/CodeGen/AMDGPU/waitcnt.mir
@@ -1,12 +1,21 @@
# RUN: llc -march=amdgcn -mcpu=fiji -run-pass si-insert-waits %s -o - | FileCheck %s
--- |
- define void @flat_zero_waitcnt(i32 addrspace(1)* %global4,
+ define amdgpu_kernel void @flat_zero_waitcnt(i32 addrspace(1)* %global4,
<4 x i32> addrspace(1)* %global16,
i32 addrspace(4)* %flat4,
<4 x i32> addrspace(4)* %flat16) {
ret void
}
+
+ define amdgpu_kernel void @single_fallthrough_successor_no_end_block_wait() {
+ ret void
+ }
+
+ define amdgpu_kernel void @single_branch_successor_not_next_block() {
+ ret void
+ }
+
...
---
@@ -21,18 +30,21 @@
# CHECK-LABEL: bb.1:
# CHECK: FLAT_LOAD_DWORD
+# CHECK: S_WAITCNT 3952
# CHECK: FLAT_LOAD_DWORDX4
# The first load has no mem operand, so we should assume it accesses the flat
# address space.
# s_waitcnt vmcnt(0) lgkmcnt(0)
-# CHECK-NEXT: S_WAITCNT 112
+# CHECK-NEXT: S_WAITCNT 127
# CHECK-LABEL: bb.2:
# CHECK: FLAT_LOAD_DWORD
+# CHECK: S_WAITCNT 3952
# CHECK: FLAT_LOAD_DWORDX4
+
+# One of the outstanding loads accesses the flat address space.
# s_waitcnt vmcnt(0) lgkmcnt(0)
-# CHECK-NEXT: S_WAITCNT 112
+# CHECK-NEXT: S_WAITCNT 127
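+# For reference, assuming the SI/VI S_WAITCNT immediate layout (vmcnt in
+# bits [3:0], expcnt in [6:4], lgkmcnt in [11:8]):
+#   112  = 0x070 -> wait for vmcnt(0) and lgkmcnt(0)
+#   127  = 0x07f -> wait for lgkmcnt(0) only
+#   3952 = 0xf70 -> wait for vmcnt(0) only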
name: flat_zero_waitcnt
@@ -57,3 +69,60 @@ body: |
%vgpr0 = V_MOV_B32_e32 %vgpr1, implicit %exec
S_ENDPGM
...
+---
+# There is only a single fallthrough successor block, so there's no
+# need to wait immediately.
+
+# CHECK-LABEL: name: single_fallthrough_successor_no_end_block_wait
+# CHECK: %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2
+# CHECK-NOT: S_WAITCNT
+
+# CHECK: bb.1:
+# CHECK-NEXT: V_LSHLREV_B64
+# CHECK-NEXT: S_WAITCNT 112
+# CHECK-NEXT: FLAT_STORE_DWORD
+name: single_fallthrough_successor_no_end_block_wait
+
+body: |
+ bb.0:
+ successors: %bb.1
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+ bb.1:
+ %vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec
+ FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
+---
+# The block has a single predecessor with a single successor, but since it
+# is not the next block, it is not obvious that the wait is unneeded.
+
+
+# CHECK-LABEL: name: single_branch_successor_not_next_block
+# CHECK: %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2
+# CHECK-NEXT: S_WAITCNT 112
+
+# CHECK: bb.1
+# CHECK-NEXT: FLAT_STORE_DWORD
+# CHECK-NEXT: S_ENDPGM
+
+# CHECK: bb.2:
+# CHECK-NEXT: V_LSHLREV_B64
+# CHECK-NEXT: FLAT_STORE_DWORD
+name: single_branch_successor_not_next_block
+
+body: |
+ bb.0:
+ successors: %bb.2
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_BRANCH %bb.2
+
+ bb.1:
+ FLAT_STORE_DWORD %vgpr8_vgpr9, %vgpr10, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+
+ bb.2:
+ %vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec
+ FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr
+ S_ENDPGM
+...
diff --git a/test/CodeGen/AMDGPU/wqm.ll b/test/CodeGen/AMDGPU/wqm.ll
index 3f7b2b284c53..9f277b2c9a59 100644
--- a/test/CodeGen/AMDGPU/wqm.ll
+++ b/test/CodeGen/AMDGPU/wqm.ll
@@ -1,5 +1,5 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=SI
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=VI
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=VI %s
; Check that WQM isn't triggered by image load/store intrinsics.
;
@@ -18,16 +18,14 @@ main_body:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK-NOT: exec
-define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
+define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x float> %c) {
main_body:
- %c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %c.1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
%c.2 = bitcast <4 x float> %c.1 to <4 x i32>
%c.3 = extractelement <4 x i32> %c.2, i32 0
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
%data = load float, float addrspace(1)* %gep
-
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %data, float undef, float undef, float undef)
-
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %data, float undef, float undef, float undef, i1 true, i1 true) #1
ret void
}
@@ -42,9 +40,9 @@ main_body:
;CHECK: store
;CHECK-NOT: exec
;CHECK: .size test3
-define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x i32> %c) {
+define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x float> %c) {
main_body:
- %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
%tex.1 = bitcast <4 x float> %tex to <4 x i32>
%tex.2 = extractelement <4 x i32> %tex.1, i32 0
@@ -70,10 +68,9 @@ main_body:
%c.1 = mul i32 %c, %d
call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0)
-
- %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %tex.1 = bitcast <4 x float> %tex to <4 x i32>
- %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %c.1.bc = bitcast i32 %c.1 to float
+ %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+ %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
ret <4 x float> %dtex
}
@@ -101,9 +98,9 @@ main_body:
br i1 %cmp, label %IF, label %ELSE
IF:
- %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %tex.1 = bitcast <4 x float> %tex to <4 x i32>
- %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %c.bc = bitcast i32 %c to float
+ %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+ %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
%data.if = extractelement <4 x float> %dtex, i32 0
br label %END
@@ -143,9 +140,9 @@ main_body:
br i1 %cmp, label %ELSE, label %IF
IF:
- %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %tex.1 = bitcast <4 x float> %tex to <4 x i32>
- %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %c.bc = bitcast i32 %c to float
+ %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+ %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
%data.if = extractelement <4 x float> %dtex, i32 0
br label %END
@@ -200,7 +197,8 @@ ELSE:
END:
%coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
- %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord.END, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %coord.END.bc = bitcast i32 %coord.END to float
+ %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
ret <4 x float> %tex
}
@@ -215,13 +213,11 @@ END:
;CHECK: image_sample
;CHECK: v_cmp
;CHECK: store
-define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, i32 %coord) {
+define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {
main_body:
- %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %tex.1 = bitcast <4 x float> %tex to <4 x i32>
- %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+ %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
%dtex.1 = extractelement <4 x float> %dtex, i32 0
-
call void @llvm.amdgcn.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
%cc = fcmp ogt float %dtex.1, 0.0
@@ -254,7 +250,7 @@ END:
;CHECK: %END
;CHECK: image_sample
;CHECK: image_sample
-define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %coord, i32 %y, float %z) {
+define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) {
main_body:
%cond = icmp eq i32 %y, 0
br i1 %cond, label %IF, label %END
@@ -265,9 +261,8 @@ IF:
br label %END
END:
- %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %tex.1 = bitcast <4 x float> %tex to <4 x i32>
- %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+ %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
ret <4 x float> %dtex
}
@@ -286,10 +281,9 @@ END:
;CHECK: buffer_store_dword
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: image_sample
-define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) {
+define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
main_body:
- %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-
+ %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
%idx.0 = extractelement <2 x i32> %idx, i32 0
%data.0 = extractelement <2 x float> %data, i32 0
call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0)
@@ -299,10 +293,8 @@ main_body:
%idx.1 = extractelement <2 x i32> %idx, i32 1
%data.1 = extractelement <2 x float> %data, i32 1
call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)
-
- %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %tex2.1 = bitcast <4 x float> %tex2 to <4 x i32>
- %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex2.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tex2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+ %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
%out = fadd <4 x float> %tex, %dtex
ret <4 x float> %out
@@ -320,11 +312,10 @@ main_body:
; CHECK: buffer_store_dword
; CHECK-NOT: wqm
; CHECK: v_cmpx_
-define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
+define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
main_body:
- %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %tex.1 = bitcast <4 x float> %tex to <4 x i32>
- %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+ %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
@@ -375,8 +366,7 @@ loop:
br i1 %cc, label %break, label %body
body:
- %c.i = bitcast <4 x float> %c.iv to <4 x i32>
- %c.next = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %c.next = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c.iv, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
%ctr.next = fadd float %ctr.iv, 2.0
br label %loop
@@ -394,7 +384,7 @@ break:
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0
; CHECK: s_wqm_b64 exec, exec
-; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+$}}
+; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4{{$}}
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen
; CHECK: s_wqm_b64 exec, exec
@@ -416,9 +406,8 @@ entry:
%c.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 %idx
%c = load i32, i32* %c.gep, align 4
-
- %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-
+ %c.bc = bitcast i32 %c to float
+ %t = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
ret void
@@ -436,9 +425,8 @@ entry:
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
- %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %tex.i = bitcast <4 x float> %tex to <4 x i32>
- %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+ %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
ret <4 x float> %dtex
}
@@ -450,10 +438,8 @@ define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
entry:
- %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %tex.i = bitcast <4 x float> %tex to <4 x i32>
- %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-
+ %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+ %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
%cc = icmp sgt i32 %c, 0
br i1 %cc, label %if, label %else
@@ -485,35 +471,29 @@ main_body:
br i1 %cc, label %if, label %else
if:
- %r.if = call <4 x float> @llvm.SI.image.sample.i32(i32 0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r.if = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float 0.0, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
br label %end
else:
- %r.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 0, i32 1>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r.else = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 0.0, float bitcast (i32 1 to float)>, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
br label %end
end:
%r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
-
call void @llvm.amdgcn.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
-
ret <4 x float> %r
}
-
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
-declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1
-declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
-
-declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
-declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2
-
-declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
-declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
-declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
-
-declare void @llvm.AMDGPU.kill(float)
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #2
+declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2
+declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #3
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3
+declare void @llvm.AMDGPU.kill(float) #1
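+; The amdgcn.image.sample variants take float coordinates plus i1 modifier
+; flags in place of the old i32 operand tail, which is why the integer
+; coordinates above are bitcast to float at each call site.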
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll b/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll
index deac809f9b05..b1ee016e99c9 100644
--- a/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll
+++ b/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll
@@ -10,7 +10,7 @@ declare void @llvm.write_register.i32(metadata, i32) #0
declare i32 @llvm.amdgcn.workitem.id.x() #0
-define void @write_vgpr_into_sgpr() {
+define amdgpu_kernel void @write_vgpr_into_sgpr() {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
call void @llvm.write_register.i32(metadata !0, i32 %tid)
ret void
diff --git a/test/CodeGen/AMDGPU/write_register.ll b/test/CodeGen/AMDGPU/write_register.ll
index 88660ba6ec6a..9c62e003dde0 100644
--- a/test/CodeGen/AMDGPU/write_register.ll
+++ b/test/CodeGen/AMDGPU/write_register.ll
@@ -4,7 +4,7 @@ declare void @llvm.write_register.i32(metadata, i32) #0
declare void @llvm.write_register.i64(metadata, i64) #0
; CHECK-LABEL: {{^}}test_write_m0:
-define void @test_write_m0(i32 %val) #0 {
+define amdgpu_kernel void @test_write_m0(i32 %val) #0 {
call void @llvm.write_register.i32(metadata !0, i32 0)
call void @llvm.write_register.i32(metadata !0, i32 -1)
call void @llvm.write_register.i32(metadata !0, i32 %val)
@@ -15,7 +15,7 @@ define void @test_write_m0(i32 %val) #0 {
; CHECK: s_mov_b64 exec, 0
; CHECK: s_mov_b64 exec, -1
; CHECK: s_mov_b64 exec, s{{\[[0-9]+:[0-9]+\]}}
-define void @test_write_exec(i64 %val) #0 {
+define amdgpu_kernel void @test_write_exec(i64 %val) #0 {
call void @llvm.write_register.i64(metadata !1, i64 0)
call void @llvm.write_register.i64(metadata !1, i64 -1)
call void @llvm.write_register.i64(metadata !1, i64 %val)
@@ -26,7 +26,7 @@ define void @test_write_exec(i64 %val) #0 {
; CHECK: s_mov_b64 flat_scratch, 0
; CHECK: s_mov_b64 flat_scratch, -1
; CHECK: s_mov_b64 flat_scratch, s{{\[[0-9]+:[0-9]+\]}}
-define void @test_write_flat_scratch(i64 %val) #0 {
+define amdgpu_kernel void @test_write_flat_scratch(i64 %val) #0 {
call void @llvm.write_register.i64(metadata !2, i64 0)
call void @llvm.write_register.i64(metadata !2, i64 -1)
call void @llvm.write_register.i64(metadata !2, i64 %val)
@@ -36,7 +36,7 @@ define void @test_write_flat_scratch(i64 %val) #0 {
; CHECK-LABEL: {{^}}test_write_flat_scratch_lo:
; CHECK: s_mov_b32 flat_scratch_lo, 0
; CHECK: s_mov_b32 flat_scratch_lo, s{{[0-9]+}}
-define void @test_write_flat_scratch_lo(i32 %val) #0 {
+define amdgpu_kernel void @test_write_flat_scratch_lo(i32 %val) #0 {
call void @llvm.write_register.i32(metadata !3, i32 0)
call void @llvm.write_register.i32(metadata !3, i32 %val)
ret void
@@ -45,7 +45,7 @@ define void @test_write_flat_scratch_lo(i32 %val) #0 {
; CHECK-LABEL: {{^}}test_write_flat_scratch_hi:
; CHECK: s_mov_b32 flat_scratch_hi, 0
; CHECK: s_mov_b32 flat_scratch_hi, s{{[0-9]+}}
-define void @test_write_flat_scratch_hi(i32 %val) #0 {
+define amdgpu_kernel void @test_write_flat_scratch_hi(i32 %val) #0 {
call void @llvm.write_register.i32(metadata !4, i32 0)
call void @llvm.write_register.i32(metadata !4, i32 %val)
ret void
@@ -54,7 +54,7 @@ define void @test_write_flat_scratch_hi(i32 %val) #0 {
; CHECK-LABEL: {{^}}test_write_exec_lo:
; CHECK: s_mov_b32 exec_lo, 0
; CHECK: s_mov_b32 exec_lo, s{{[0-9]+}}
-define void @test_write_exec_lo(i32 %val) #0 {
+define amdgpu_kernel void @test_write_exec_lo(i32 %val) #0 {
call void @llvm.write_register.i32(metadata !5, i32 0)
call void @llvm.write_register.i32(metadata !5, i32 %val)
ret void
@@ -63,7 +63,7 @@ define void @test_write_exec_lo(i32 %val) #0 {
; CHECK-LABEL: {{^}}test_write_exec_hi:
; CHECK: s_mov_b32 exec_hi, 0
; CHECK: s_mov_b32 exec_hi, s{{[0-9]+}}
-define void @test_write_exec_hi(i32 %val) #0 {
+define amdgpu_kernel void @test_write_exec_hi(i32 %val) #0 {
call void @llvm.write_register.i32(metadata !6, i32 0)
call void @llvm.write_register.i32(metadata !6, i32 %val)
ret void
diff --git a/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll b/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
index 7f6b80459047..36532365d871 100644
--- a/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
+++ b/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
@@ -4,7 +4,7 @@
;CHECK: {{^}}fill3d:
;CHECK-NOT: MULLO_INT T[0-9]+
-define void @fill3d(i32 addrspace(1)* nocapture %out) #0 {
+define amdgpu_kernel void @fill3d(i32 addrspace(1)* nocapture %out) #0 {
entry:
%x.i = tail call i32 @llvm.r600.read.global.size.x() #1
%y.i18 = tail call i32 @llvm.r600.read.global.size.y() #1
diff --git a/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll b/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll
index babae9ead27c..88ef9fd93c8f 100644
--- a/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll
+++ b/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll
@@ -5,7 +5,7 @@
; TODO: enable doubles
; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32:
-define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
%val = load double, double addrspace(1)* %in, align 8
%add = fadd double %val, 4.0
%bc = bitcast double %add to <2 x i32>
@@ -14,7 +14,7 @@ define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace
}
; FUNC-LABEL: {{^}}bitcast_v2i64_to_v2f64:
-define void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
+define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -30,7 +30,7 @@ end:
}
; FUNC-LABEL: {{^}}bitcast_v2f64_to_v2i64:
-define void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
+define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll
index bf02d4c3b311..57a082a0170c 100644
--- a/test/CodeGen/AMDGPU/xor.ll
+++ b/test/CodeGen/AMDGPU/xor.ll
@@ -10,7 +10,7 @@
; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
+define amdgpu_kernel void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
%a = load <2 x i32>, <2 x i32> addrspace(1) * %in0
%b = load <2 x i32>, <2 x i32> addrspace(1) * %in1
%result = xor <2 x i32> %a, %b
@@ -29,7 +29,7 @@ define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in
; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
+define amdgpu_kernel void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
%a = load <4 x i32>, <4 x i32> addrspace(1) * %in0
%b = load <4 x i32>, <4 x i32> addrspace(1) * %in1
%result = xor <4 x i32> %a, %b
@@ -46,7 +46,7 @@ define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
; SI: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
-define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
+define amdgpu_kernel void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
%a = load float, float addrspace(1) * %in0
%b = load float, float addrspace(1) * %in1
%acmp = fcmp oge float %a, 0.000000e+00
@@ -63,7 +63,7 @@ define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float ad
; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[A]], [[B]]
; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]]
; SI: buffer_store_byte [[RESULT]]
-define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
+define amdgpu_kernel void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
%a = load volatile i1, i1 addrspace(1)* %in0
%b = load volatile i1, i1 addrspace(1)* %in1
%xor = xor i1 %a, %b
@@ -73,7 +73,7 @@ define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace
; FUNC-LABEL: {{^}}vector_xor_i32:
; SI: v_xor_b32_e32
-define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+define amdgpu_kernel void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
%a = load i32, i32 addrspace(1)* %in0
%b = load i32, i32 addrspace(1)* %in1
%result = xor i32 %a, %b
@@ -83,7 +83,7 @@ define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32
; FUNC-LABEL: {{^}}scalar_xor_i32:
; SI: s_xor_b32
-define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
%result = xor i32 %a, %b
store i32 %result, i32 addrspace(1)* %out
ret void
@@ -91,7 +91,7 @@ define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
; FUNC-LABEL: {{^}}scalar_not_i32:
; SI: s_not_b32
-define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) {
%result = xor i32 %a, -1
store i32 %result, i32 addrspace(1)* %out
ret void
@@ -99,7 +99,7 @@ define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) {
; FUNC-LABEL: {{^}}vector_not_i32:
; SI: v_not_b32
-define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+define amdgpu_kernel void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
%a = load i32, i32 addrspace(1)* %in0
%b = load i32, i32 addrspace(1)* %in1
%result = xor i32 %a, -1
@@ -111,7 +111,7 @@ define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32
; SI: v_xor_b32_e32
; SI: v_xor_b32_e32
; SI: s_endpgm
-define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
+define amdgpu_kernel void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
%a = load i64, i64 addrspace(1)* %in0
%b = load i64, i64 addrspace(1)* %in1
%result = xor i64 %a, %b
@@ -122,7 +122,7 @@ define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64
; FUNC-LABEL: {{^}}scalar_xor_i64:
; SI: s_xor_b64
; SI: s_endpgm
-define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
%result = xor i64 %a, %b
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -130,7 +130,7 @@ define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
; FUNC-LABEL: {{^}}scalar_not_i64:
; SI: s_not_b64
-define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) {
%result = xor i64 %a, -1
store i64 %result, i64 addrspace(1)* %out
ret void
@@ -139,7 +139,7 @@ define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) {
; FUNC-LABEL: {{^}}vector_not_i64:
; SI: v_not_b32
; SI: v_not_b32
-define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
+define amdgpu_kernel void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
%a = load i64, i64 addrspace(1)* %in0
%b = load i64, i64 addrspace(1)* %in1
%result = xor i64 %a, -1
@@ -153,7 +153,7 @@ define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64
; FUNC-LABEL: {{^}}xor_cf:
; SI: s_xor_b64
-define void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) {
+define amdgpu_kernel void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) {
entry:
%0 = icmp eq i64 %a, 0
br i1 %0, label %if, label %else
@@ -178,7 +178,7 @@ endif:
; SI-DAG: s_xor_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039
; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]]
; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]]
-define void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) {
%or = xor i64 %a, 4261135838621753
store i64 %or, i64 addrspace(1)* %out
ret void
@@ -192,7 +192,7 @@ define void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) {
; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]]
; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]]
-define void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
%or = xor i64 %a, 4261135838621753
store i64 %or, i64 addrspace(1)* %out
@@ -211,7 +211,7 @@ define void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i6
; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]]
; SI-NOT: xor_b32
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
%or = xor i64 %a, 63
store i64 %or, i64 addrspace(1)* %out
ret void
@@ -220,7 +220,7 @@ define void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
; FUNC-LABEL: {{^}}scalar_xor_neg_inline_imm_i64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; SI: s_xor_b64 [[VAL]], [[VAL]], -8
-define void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
%or = xor i64 %a, -8
store i64 %or, i64 addrspace(1)* %out
ret void
@@ -231,7 +231,7 @@ define void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
; SI: v_xor_b32_e32 {{v[0-9]+}}, -8, v[[LO_VREG]]
; SI: v_xor_b32_e32 {{v[0-9]+}}, -1, {{.*}}
; SI: s_endpgm
-define void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 8
%or = xor i64 %loada, -8
store i64 %or, i64 addrspace(1)* %out
@@ -243,7 +243,7 @@ define void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace
; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]]
; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]]
; SI: s_endpgm
-define void @vector_xor_literal_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_xor_literal_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 8
%or = xor i64 %loada, 22470723082367
store i64 %or, i64 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/zero_extend.ll b/test/CodeGen/AMDGPU/zero_extend.ll
index 572099617605..f256d89f0cb7 100644
--- a/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/test/CodeGen/AMDGPU/zero_extend.ll
@@ -9,7 +9,7 @@
; SI: {{^}}s_mad_zext_i32_to_i64:
; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}}
; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}}
-define void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 {
+define amdgpu_kernel void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 {
entry:
%tmp0 = mul i32 %a, %b
%tmp1 = add i32 %tmp0, %c
@@ -20,7 +20,7 @@ entry:
; SI-LABEL: {{^}}s_cmp_zext_i1_to_i32
; SI: v_cndmask_b32
-define void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%tmp0 = icmp eq i32 %a, %b
%tmp1 = zext i1 %tmp0 to i32
@@ -29,7 +29,7 @@ entry:
}
; SI-LABEL: {{^}}s_arg_zext_i1_to_i64:
-define void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 {
+define amdgpu_kernel void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 {
%ext = zext i1 %arg to i64
store i64 %ext, i64 addrspace(1)* %out, align 8
ret void
@@ -39,7 +39,7 @@ define void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 {
; SI: s_mov_b32 s{{[0-9]+}}, 0
; SI: v_cmp_eq_u32
; SI: v_cndmask_b32
-define void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%cmp = icmp eq i32 %a, %b
%ext = zext i1 %cmp to i64
store i64 %ext, i64 addrspace(1)* %out, align 8
@@ -49,7 +49,7 @@ define void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; SI-LABEL: {{^}}s_cmp_zext_i1_to_i16
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; SI: buffer_store_short [[RESULT]]
-define void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
+define amdgpu_kernel void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
%tmp0 = icmp eq i16 %a, %b
%tmp1 = zext i1 %tmp0 to i16
store i16 %tmp1, i16 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll b/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
index 842c30b40df2..a902234898cd 100644
--- a/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
+++ b/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
@@ -11,7 +11,7 @@
; GCN-NOT: v[[HI]]
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+define amdgpu_kernel void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
%ld.64 = load volatile i64, i64 addrspace(1)* %in0
%ld.32 = load volatile i32, i32 addrspace(1)* %in1
%ext = zext i32 %ld.32 to i64
@@ -31,7 +31,7 @@ define void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0,
; GCN-NOT: _or_
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @zext_or_operand_commute_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+define amdgpu_kernel void @zext_or_operand_commute_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
%ld.64 = load volatile i64, i64 addrspace(1)* %in0
%ld.32 = load volatile i32, i32 addrspace(1)* %in1
%ext = zext i32 %ld.32 to i64
diff --git a/test/CodeGen/AMDGPU/zext-lid.ll b/test/CodeGen/AMDGPU/zext-lid.ll
new file mode 100644
index 000000000000..8eeff53ff99f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/zext-lid.ll
@@ -0,0 +1,83 @@
+; RUN: llc -march=amdgcn < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-intrinsics < %s | FileCheck -check-prefix=OPT %s
+
+; CHECK-NOT: and_b32
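+; With the !range metadata the pass attaches (checked per function below),
+; each workitem id is provably smaller than its mask, so the ands fold away
+; and no and_b32 should survive to the ISel output.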
+
+; OPT-LABEL: @zext_grp_size_128
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !0
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !0
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !0
+define amdgpu_kernel void @zext_grp_size_128(i32 addrspace(1)* nocapture %arg) #0 {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+ %tmp1 = and i32 %tmp, 127
+ store i32 %tmp1, i32 addrspace(1)* %arg, align 4
+ %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2
+ %tmp3 = and i32 %tmp2, 127
+ %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+ store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4
+ %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2
+ %tmp6 = and i32 %tmp5, 127
+ %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+ store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4
+ ret void
+}
+
+; OPT-LABEL: @zext_grp_size_32x4x1
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !2
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !3
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !4
+define amdgpu_kernel void @zext_grp_size_32x4x1(i32 addrspace(1)* nocapture %arg) #0 !reqd_work_group_size !0 {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+ %tmp1 = and i32 %tmp, 31
+ store i32 %tmp1, i32 addrspace(1)* %arg, align 4
+ %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2
+ %tmp3 = and i32 %tmp2, 3
+ %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+ store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4
+ %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2
+ %tmp6 = and i32 %tmp5, 1
+ %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+ store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4
+ ret void
+}
+
+; OPT-LABEL: @zext_grp_size_512
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !5
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !5
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !5
+define amdgpu_kernel void @zext_grp_size_512(i32 addrspace(1)* nocapture %arg) #1 {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+ %tmp1 = and i32 %tmp, 65535
+ store i32 %tmp1, i32 addrspace(1)* %arg, align 4
+ %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2
+ %tmp3 = and i32 %tmp2, 65535
+ %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+ store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4
+ %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2
+ %tmp6 = and i32 %tmp5, 65535
+ %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+ store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
+declare i32 @llvm.amdgcn.workitem.id.y() #2
+
+declare i32 @llvm.amdgcn.workitem.id.z() #2
+
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,128" }
+attributes #1 = { nounwind "amdgpu-flat-work-group-size"="512,512" }
+attributes #2 = { nounwind readnone }
+
+!0 = !{i32 32, i32 4, i32 1}
+
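+; The ranges follow directly from the work-group bounds: the flat maximum of
+; 128 yields !0 for all three dimensions, reqd_work_group_size 32x4x1 yields
+; the per-dimension !2, !3 and !4, and the 512 bound yields !5.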
+; OPT: !0 = !{i32 0, i32 128}
+; OPT: !1 = !{i32 32, i32 4, i32 1}
+; OPT: !2 = !{i32 0, i32 32}
+; OPT: !3 = !{i32 0, i32 4}
+; OPT: !4 = !{i32 0, i32 1}
+; OPT: !5 = !{i32 0, i32 512}